reddit/reddit.ipynb
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python385jvsc74a57bd031f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6",
"display_name": "Python 3.8.5 64-bit"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" created_date created_timestamp subreddit \\\n",
"0 2008-07-06 16:00:14 1.215349e+09 artificial \n",
"1 2008-08-27 16:26:50 1.219844e+09 artificial \n",
"2 2008-10-12 00:29:40 1.223761e+09 artificial \n",
"3 2008-10-12 00:40:40 1.223761e+09 artificial \n",
"4 2008-10-14 20:31:01 1.224005e+09 artificial \n",
"\n",
" title id author \\\n",
"0 Man-Machine Poker (Solaris 2) Results (July 3-... 6qgmm IhateEverything \n",
"1 History of artificial intelligence 6y98d [deleted] \n",
"2 Minsky's Critics, Selectors and Resources at a... 76liu liamQ \n",
"3 The Single Layer Perceptron 76ljt liamQ \n",
"4 Siri Raises $8.5 Million for Personal Artifici... 773i4 CuteAlien \n",
"\n",
" author_created_utc full_link \\\n",
"0 1.198203e+09 https://www.reddit.com/r/artificial/comments/6... \n",
"1 NaN https://www.reddit.com/r/artificial/comments/6... \n",
"2 1.223677e+09 https://www.reddit.com/r/artificial/comments/7... \n",
"3 1.223677e+09 https://www.reddit.com/r/artificial/comments/7... \n",
"4 1.179241e+09 https://www.reddit.com/r/artificial/comments/7... \n",
"\n",
" score num_comments num_crossposts subreddit_subscribers post \n",
"0 4.0 1.0 0.0 NaN NaN \n",
"1 5.0 0.0 0.0 NaN NaN \n",
"2 1.0 0.0 0.0 NaN NaN \n",
"3 2.0 1.0 0.0 NaN NaN \n",
"4 4.0 0.0 0.0 NaN NaN "
]
},
"metadata": {},
"execution_count": 7
}
],
"source": [
"WORK_DIR = 'data/'\n",
"all_data = pd.DataFrame()\n",
"\n",
"for dataset in os.listdir(WORK_DIR):\n",
" all_data = pd.concat([all_data, pd.read_csv(WORK_DIR + dataset, index_col = 0)])\n",
" \n",
"all_data = all_data.reset_index(drop = True)\n",
"all_data['created_date'] = all_data['created_date'].astype('datetime64')\n",
"all_data.head()"
]
},
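{
"source": [
"If `data/` may ever contain files other than CSVs, a glob-based variant (a sketch, assuming the same flat directory layout) picks up only `.csv` files:"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from glob import glob\n",
"\n",
"# Read only .csv files, then stack them into one DataFrame\n",
"frames = [pd.read_csv(path, index_col=0) for path in sorted(glob(WORK_DIR + '*.csv'))]\n",
"all_data = pd.concat(frames, ignore_index=True)"
]
},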
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" created_date created_timestamp subreddit \\\n",
"162616 2020-01-20 16:00:26 1.579529e+09 dataengineering \n",
"748 2013-09-19 05:07:26 1.379556e+09 artificial \n",
"54980 2016-02-07 00:05:10 1.454796e+09 AskStatistics \n",
"270585 2019-07-29 23:45:21 1.564433e+09 learnmachinelearning \n",
"82331 2021-03-14 03:40:36 1.615686e+09 AskStatistics \n",
"\n",
" title id \\\n",
"162616 SAP cloud Data Warehouse erdjne \n",
"748 Markov extension for Chrome 1moo58 \n",
"54980 Can you apply the same statistic to different ... 44ieeg \n",
"270585 How do Histogram of Oriented Gradients descrip... cjh5ay \n",
"82331 The topic is \"level of satisfaction on governm... m4kvs1 \n",
"\n",
" author author_created_utc \\\n",
"162616 Boozmork NaN \n",
"748 EmoryM 1.211877e+09 \n",
"54980 hello30303049 1.454796e+09 \n",
"270585 EverydayQuestion NaN \n",
"82331 pearsonsigma NaN \n",
"\n",
" full_link score \\\n",
"162616 https://www.reddit.com/r/dataengineering/comme... 1.0 \n",
"748 https://www.reddit.com/r/artificial/comments/1... 19.0 \n",
"54980 https://www.reddit.com/r/AskStatistics/comment... 1.0 \n",
"270585 https://www.reddit.com/r/learnmachinelearning/... 1.0 \n",
"82331 https://www.reddit.com/r/AskStatistics/comment... 1.0 \n",
"\n",
" num_comments num_crossposts subreddit_subscribers \\\n",
"162616 6.0 0.0 9356.0 \n",
"748 7.0 NaN NaN \n",
"54980 5.0 NaN NaN \n",
"270585 0.0 0.0 82047.0 \n",
"82331 2.0 0.0 39909.0 \n",
"\n",
" post author_created_date \n",
"162616 Hi Engineers, \\n\\nThe company I work for is at... NaT \n",
"748 I think the results of markov chains are great... 2008-05-27 08:31:47 \n",
"54980 Sorry if my question is worded badly. Here's a... 2016-02-06 22:00:26 \n",
"270585 I'm looking through this tutorial on creating ... NaT \n",
"82331 NaN NaT "
]
},
"metadata": {},
"execution_count": 153
}
],
"source": [
"all_data['author_created_date'] = pd.to_datetime(all_data['author_created_utc'], unit='s')\n",
"all_data['author_created_date'].head()\n",
"\n",
"all_data['created_date'] = pd.to_datetime(all_data['created_date'])\n",
"\n",
"all_data = all_data.sample(1000)\n",
"all_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" created_timestamp author_created_utc score num_comments \\\n",
"count 1.000000e+03 2.160000e+02 1000.000000 1000.000000 \n",
"mean 1.539772e+09 1.384704e+09 4.006000 4.005000 \n",
"std 6.565712e+07 8.450600e+07 19.502931 12.309691 \n",
"min 1.249206e+09 1.122350e+09 0.000000 0.000000 \n",
"25% 1.501587e+09 1.332197e+09 1.000000 0.000000 \n",
"50% 1.556755e+09 1.407746e+09 1.000000 1.000000 \n",
"75% 1.591825e+09 1.447657e+09 1.000000 4.000000 \n",
"max 1.615972e+09 1.498049e+09 519.000000 207.000000 \n",
"\n",
" num_crossposts subreddit_subscribers \n",
"count 753.000000 6.920000e+02 \n",
"mean 0.009296 2.463465e+05 \n",
"std 0.109003 3.605919e+05 \n",
"min 0.000000 1.517000e+03 \n",
"25% 0.000000 4.200400e+04 \n",
"50% 0.000000 9.736050e+04 \n",
"75% 0.000000 2.191938e+05 \n",
"max 2.000000 1.736778e+06 "
]
},
"metadata": {},
"execution_count": 154
}
],
"source": [
"all_data.describe()\n"
]
},
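{
"source": [
"`describe()` reports a different `count` per column, which hints at missing values; an explicit per-column NaN tally (a small supplementary check) makes that concrete:"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Number of missing values in each column of the sample\n",
"all_data.isna().sum()"
]
},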
{
"source": [
"1. Summarize the data (4 points)\n",
"\n",
"## Which subreddit has the most posts (top 5)? ##\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"MachineLearning 225\n",
"statistics 118\n",
"datascience 115\n",
"learnmachinelearning 93\n",
"computerscience 82\n",
"Name: subreddit, dtype: int64"
]
},
"metadata": {},
"execution_count": 155
}
],
"source": [
"all_data['subreddit'].value_counts().head(5)"
]
},
{
"source": [
"## Which user has the most posts (top 5)? ##\n",
" \n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[deleted] 36\n",
"ai_jobs 14\n",
"aijobs-com 8\n",
"Yuqing7 5\n",
"AutoModerator 5\n",
"Name: author, dtype: int64"
]
},
"metadata": {},
"execution_count": 156
}
],
"source": [
"all_data['author'].value_counts().head(5)"
]
},
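{
"source": [
"`[deleted]` and `AutoModerator` are placeholder accounts rather than real users, so a filtered count (an optional refinement, assuming those are the only such accounts in the data) may answer the question more faithfully:"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Exclude placeholder accounts before counting authors\n",
"real_authors = all_data.loc[~all_data['author'].isin(['[deleted]', 'AutoModerator']), 'author']\n",
"real_authors.value_counts().head(5)"
]
},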
{
"source": [
"## Which subreddit has the most distinct post authors? ##"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" author\n",
"subreddit \n",
"MachineLearning 209\n",
"statistics 114\n",
"datascience 108\n",
"learnmachinelearning 89\n",
"computerscience 78"
]
},
"metadata": {},
"execution_count": 157
}
],
"source": [
"grouped_df = all_data.groupby(\"subreddit\")\n",
"\n",
"grouped_df = grouped_df.agg({\"author\": \"nunique\"})\n",
"grouped_df.sort_values('author', ascending=False).head(5)\n"
]
},
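{
"source": [
"Equivalently, the distinct-author count per subreddit is a one-liner with `nunique` and `nlargest`:"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Distinct authors per subreddit, top 5\n",
"all_data.groupby('subreddit')['author'].nunique().nlargest(5)"
]
},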
{
"source": [
"## Which subreddit contains the greatest percentage of posts with a post body (i.e. contains a value in the post column)? ##"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" post\n",
"subreddit \n",
"datascienceproject 100.000000\n",
"DataScienceJobs 92.000000\n",
"artificial 78.205128\n",
"data 76.923077\n",
"MachineLearning 69.777778"
]
},
"metadata": {},
"execution_count": 158
}
],
"source": [
"# grouped_df = all_data.groupby(\"subreddit\")\n",
"\n",
"# grouped_df = grouped_df.agg({\"post\": \"isna\"})\n",
"#grouped_df.sort_values('author', ascending=False)\n",
"grouped_df = all_data.groupby(\"subreddit\")\n",
"#all_data['post'].isnull().sum(axis = 0)\n",
"grouped_df = grouped_df.agg({'post': lambda x: x.isnull().sum()*100 / (x.notnull().sum() + x.isnull().sum())})\n",
"grouped_df.sort_values('post', ascending=False).head(5)\n"
]
},
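{
"source": [
"An equivalent vectorized form avoids the lambda entirely: the mean of the boolean `notna()` mask is exactly the fraction of posts with a body."
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Percentage of posts with a non-null body, per subreddit\n",
"(all_data['post'].notna().groupby(all_data['subreddit']).mean() * 100).nlargest(5)"
]
},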
{
"source": [
"## 2. Visualize the data (4 points) ##\n",
"\n",
"### Plot the total number of posts across all subreddits over time (line plot). ### \n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "