I've sent a screenshot of the assignment details in my notepad. (Screenshot 15)

1 answer below »
I've sent a screenshot of the assignment details in my notepad. (Screenshot 15)
Answered Same DayApr 29, 2021

Answer To: I've sent a screenshot of the assignment details in my notepad. (Screenshot 15)

Sandeep Kumar answered on Apr 30 2021
159 Votes
reddit/reddit.ipynb
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"orig_nbformat": 2,
"kernelspec": {
"name": "python385jvsc74a57bd031f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6",
"display_name": "Python 3.8.5 64-bit"
},
"metadata": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" created_date created_timestamp subreddit \\\n",
"0 2008-07-06 16:00:14 1.215349e+09 artificial \n",
"1 2008-08-27 16:26:50 1.219844e+09 artificial \n",
"2 2008-10-12 00:29:40 1.223761e+09 artificial \n",
"3 2008-10-12 00:40:40 1.223761e+09 artificial \n",
"4 2008-10-14 20:31:01 1.224005e+09 artificial \n",
"\n",
" title id author \\\n",
"0 Man-Machine Poker (Solaris 2) Results (July 3-... 6qgmm IhateEverything \n",
"1 History of artificial intelligence 6y98d [deleted] \n",
"2 Minsky's Critics, Selectors and Resources at a... 76liu liamQ \n",
"3 The Single Layer Perceptron 76ljt liamQ \n",
"4 Siri Raises $8.5 Million for Personal Artifici... 773i4 CuteAlien \n",
"\n",
" author_created_utc full_link \\\n",
"0 1.198203e+09 https://www.reddit.com/r/artificial/comments/6... \n",
"1 NaN https://www.reddit.com/r/artificial/comments/6... \n",
"2 1.223677e+09 https://www.reddit.com/r/artificial/comments/7... \n",
"3 1.223677e+09 https://www.reddit.com/r/artificial/comments/7... \n",
"4 1.179241e+09 https://www.reddit.com/r/artificial/comments/7... \n",
"\n",
" score num_comments num_crossposts subreddit_subscribers post \n",
"0 4.0 1.0 0.0 NaN NaN \n",
"1 5.0 0.0 0.0 NaN NaN \n",
"2 1.0 0.0 0.0 NaN NaN \n",
"3 2.0 1.0 0.0 NaN NaN \n",
"4 4.0 0.0 0.0 NaN NaN "
],
"text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
created_datecreated_timestampsubreddittitleidauthorauthor_created_utcfull_linkscorenum_commentsnum_crosspostssubreddit_subscriberspost
02008-07-06 16:00:141.215349e+09artificialMan-Machine Poker (Solaris 2) Results (July 3-...6qgmmIhateEverything1.198203e+09https://www.reddit.com/r/artificial/comments/6...4.01.00.0NaNNaN
12008-08-27 16:26:501.219844e+09artificialHistory of artificial intelligence6y98d[deleted]NaNhttps://www.reddit.com/r/artificial/comments/6...5.00.00.0NaNNaN
22008-10-12 00:29:401.223761e+09artificialMinsky's Critics, Selectors and Resources at a...76liuliamQ1.223677e+09https://www.reddit.com/r/artificial/comments/7...1.00.00.0NaNNaN
32008-10-12 00:40:401.223761e+09artificialThe Single Layer Perceptron76ljtliamQ1.223677e+09https://www.reddit.com/r/artificial/comments/7...2.01.00.0NaNNaN
42008-10-14 20:31:011.224005e+09artificialSiri Raises $8.5 Million for Personal Artifici...773i4CuteAlien1.179241e+09https://www.reddit.com/r/artificial/comments/7...4.00.00.0NaNNaN
\n
"
},
"metadata": {},
"execution_count": 7
}
],
"source": [
"WORK_DIR = 'data/'\n",
"all_data = pd.DataFrame()\n",
"\n",
"for dataset in os.listdir(WORK_DIR):\n",
" all_data = pd.concat([all_data, pd.read_csv(WORK_DIR + dataset, index_col = 0)])\n",
" \n",
"all_data = all_data.reset_index(drop = True)\n",
"all_data['created_date'] = all_data['created_date'].astype('datetime64')\n",
"all_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" created_date created_timestamp subreddit \\\n",
"162616 2020-01-20 16:00:26 1.579529e+09 dataengineering \n",
"748 2013-09-19 05:07:26 1.379556e+09 artificial \n",
"54980 2016-02-07 00:05:10 1.454796e+09 AskStatistics \n",
"270585 2019-07-29 23:45:21 1.564433e+09 learnmachinelearning \n",
"82331 2021-03-14 03:40:36 1.615686e+09 AskStatistics \n",
"\n",
" title id \\\n",
"162616 SAP cloud Data Warehouse erdjne \n",
"748 Markov extension for Chrome 1moo58 \n",
"54980 Can you apply the same statistic to different ... 44ieeg \n",
"270585 How do Histogram of Oriented Gradients descrip... cjh5ay \n",
"82331 The topic is \"level of satisfaction on governm... m4kvs1 \n",
"\n",
" author author_created_utc \\\n",
"162616 Boozmork NaN \n",
"748 EmoryM 1.211877e+09 \n",
"54980 hello30303049 1.454796e+09 \n",
"270585 EverydayQuestion NaN \n",
"82331 pearsonsigma NaN \n",
"\n",
" full_link score \\\n",
"162616 https://www.reddit.com/r/dataengineering/comme... 1.0 \n",
"748 https://www.reddit.com/r/artificial/comments/1... 19.0 \n",
"54980 https://www.reddit.com/r/AskStatistics/comment... 1.0 \n",
"270585 https://www.reddit.com/r/learnmachinelearning/... 1.0 \n",
"82331 https://www.reddit.com/r/AskStatistics/comment... 1.0 \n",
"\n",
" num_comments num_crossposts subreddit_subscribers \\\n",
"162616 6.0 0.0 9356.0 \n",
"748 7.0 NaN NaN \n",
"54980 5.0 NaN NaN \n",
"270585 0.0 0.0 82047.0 \n",
"82331 2.0 0.0 39909.0 \n",
"\n",
" post author_created_date \n",
"162616 Hi Engineers, \\n\\nThe company I work for is at... NaT \n",
"748 I think the results of markov chains are great... 2008-05-27 08:31:47 \n",
"54980 Sorry if my question is worded badly. Here's a... 2016-02-06 22:00:26 \n",
"270585 I'm looking through this tutorial on creating ... NaT \n",
"82331 NaN NaT "
],
"text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
created_datecreated_timestampsubreddittitleidauthorauthor_created_utcfull_linkscorenum_commentsnum_crosspostssubreddit_subscriberspostauthor_created_date
1626162020-01-20 16:00:261.579529e+09dataengineeringSAP cloud Data WarehouseerdjneBoozmorkNaNhttps://www.reddit.com/r/dataengineering/comme...1.06.00.09356.0Hi Engineers, \\n\\nThe company I work for is at...NaT
7482013-09-19 05:07:261.379556e+09artificialMarkov extension for Chrome1moo58EmoryM1.211877e+09https://www.reddit.com/r/artificial/comments/1...19.07.0NaNNaNI think the results of markov chains are great...2008-05-27 08:31:47
549802016-02-07 00:05:101.454796e+09AskStatisticsCan you apply the same statistic to different ...44ieeghello303030491.454796e+09https://www.reddit.com/r/AskStatistics/comment...1.05.0NaNNaNSorry if my question is worded badly. Here's a...2016-02-06 22:00:26
2705852019-07-29 23:45:211.564433e+09learnmachinelearningHow do Histogram of Oriented Gradients descrip...cjh5ayEverydayQuestionNaNhttps://www.reddit.com/r/learnmachinelearning/...1.00.00.082047.0I'm looking through this tutorial on creating ...NaT
823312021-03-14 03:40:361.615686e+09AskStatisticsThe topic is \"level of satisfaction on governm...m4kvs1pearsonsigmaNaNhttps://www.reddit.com/r/AskStatistics/comment...1.02.00.039909.0NaNNaT
\n
"
},
"metadata": {},
"execution_count": 153
}
],
"source": [
"all_data['author_created_date'] = pd.to_datetime(all_data['author_created_utc'], unit='s')\n",
"all_data['author_created_date'].head()\n",
"\n",
"all_data['created_date'] = pd.to_datetime(all_data['created_date'])\n",
"\n",
"all_data = all_data.sample(1000)\n",
"all_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" created_timestamp author_created_utc score num_comments \\\n",
"count 1.000000e+03 2.160000e+02 1000.000000 1000.000000 \n",
"mean 1.539772e+09 1.384704e+09 4.006000 4.005000 \n",
"std 6.565712e+07 8.450600e+07 19.502931 12.309691 \n",
"min 1.249206e+09 1.122350e+09 0.000000 0.000000 \n",
"25% 1.501587e+09 1.332197e+09 1.000000 0.000000 \n",
"50% 1.556755e+09 1.407746e+09 1.000000 1.000000 \n",
"75% 1.591825e+09 1.447657e+09 1.000000 4.000000 \n",
"max 1.615972e+09 1.498049e+09 519.000000 207.000000 \n",
"\n",
" num_crossposts subreddit_subscribers \n",
"count 753.000000 6.920000e+02 \n",
"mean 0.009296 2.4
63465e+05 \n",
"std 0.109003 3.605919e+05 \n",
"min 0.000000 1.517000e+03 \n",
"25% 0.000000 4.200400e+04 \n",
"50% 0.000000 9.736050e+04 \n",
"75% 0.000000 2.191938e+05 \n",
"max 2.000000 1.736778e+06 "
],
"text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
created_timestampauthor_created_utcscorenum_commentsnum_crosspostssubreddit_subscribers
count1.000000e+032.160000e+021000.0000001000.000000753.0000006.920000e+02
mean1.539772e+091.384704e+094.0060004.0050000.0092962.463465e+05
std6.565712e+078.450600e+0719.50293112.3096910.1090033.605919e+05
min1.249206e+091.122350e+090.0000000.0000000.0000001.517000e+03
25%1.501587e+091.332197e+091.0000000.0000000.0000004.200400e+04
50%1.556755e+091.407746e+091.0000001.0000000.0000009.736050e+04
75%1.591825e+091.447657e+091.0000004.0000000.0000002.191938e+05
max1.615972e+091.498049e+09519.000000207.0000002.0000001.736778e+06
\n
"
},
"metadata": {},
"execution_count": 154
}
],
"source": [
"all_data.describe()\n"
]
},
{
"source": [
"1. Summarize the data (4 points)\n",
"\n",
"## Which subreddit has the most posts (top 5)? ##\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 155,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"MachineLearning 225\n",
"statistics 118\n",
"datascience 115\n",
"learnmachinelearning 93\n",
"computerscience 82\n",
"Name: subreddit, dtype: int64"
]
},
"metadata": {},
"execution_count": 155
}
],
"source": [
"all_data['subreddit'].value_counts().head(5)"
]
},
{
"source": [
"## Which user has the most posts (top 5)? ##\n",
" \n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"[deleted] 36\n",
"ai_jobs 14\n",
"aijobs-com 8\n",
"Yuqing7 5\n",
"AutoModerator 5\n",
"Name: author, dtype: int64"
]
},
"metadata": {},
"execution_count": 156
}
],
"source": [
"all_data['author'].value_counts().head(5)"
]
},
{
"source": [
"## Which subreddit has the most distinct post authors? ##"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" author\n",
"subreddit \n",
"MachineLearning 209\n",
"statistics 114\n",
"datascience 108\n",
"learnmachinelearning 89\n",
"computerscience 78"
],
"text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
author
subreddit
MachineLearning209
statistics114
datascience108
learnmachinelearning89
computerscience78
\n
"
},
"metadata": {},
"execution_count": 157
}
],
"source": [
"grouped_df = all_data.groupby(\"subreddit\")\n",
"\n",
"grouped_df = grouped_df.agg({\"author\": \"nunique\"})\n",
"grouped_df.sort_values('author', ascending=False).head(5)\n"
]
},
{
"source": [
"## Which subreddit contains the greatest percentage of posts with a post body (i.e. contains a value in the post column)? ##"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" post\n",
"subreddit \n",
"datascienceproject 100.000000\n",
"DataScienceJobs 92.000000\n",
"artificial 78.205128\n",
"data 76.923077\n",
"MachineLearning 69.777778"
],
"text/html": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
post
subreddit
datascienceproject100.000000
DataScienceJobs92.000000
artificial78.205128
data76.923077
MachineLearning69.777778
\n
"
},
"metadata": {},
"execution_count": 158
}
],
"source": [
"# grouped_df = all_data.groupby(\"subreddit\")\n",
"\n",
"# grouped_df = grouped_df.agg({\"post\": \"isna\"})\n",
"#grouped_df.sort_values('author', ascending=False)\n",
"grouped_df = all_data.groupby(\"subreddit\")\n",
"#all_data['post'].isnull().sum(axis = 0)\n",
"grouped_df = grouped_df.agg({'post': lambda x: x.isnull().sum()*100 / (x.notnull().sum() + x.isnull().sum())})\n",
"grouped_df.sort_values('post', ascending=False).head(5)\n"
]
},
{
"source": [
"## 2. Visualize the data (4 points) ##\n",
"\n",
"### Plot the total number of posts across all subreddits over time (line plot). ### \n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "
",
"image/svg+xml": "\n\n\n\n \n\n\n\n2021-04-30T13:06:55.222331\nimage/svg+xml\n\n\nMatplotlib v3.3.4, https://matplotlib.org/\n\n\n\n\n \n \n\n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n \n\n\n\n \n\n",
"image/png": "\n"
},
"metadata": {
"needs_background": "light"
}
}
],
"source": [
"plt.rcParams[\"figure.figsize\"]=30,20\n",
"plt.plot(all_data['subreddit'].value_counts())\n",
"plt.show()"
]
},
{
"source": [
"### Plot a histogram showing the distribution of post scores. ### \n",
"\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "
",
"image/svg+xml": "\n\n\n\n \n\n\n\n2021-04-30T13:06:56.111341\nimage/svg+xml\n\n\nMatplotlib v3.3.4, https://matplotlib.org/\n\n\n\n\n \n \n\n \n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n \n\n\n\n \n\n",
"image/png":...
SOLUTION.PDF

Answer To This Question Is Available To Download

Related Questions & Answers

More Questions »

Submit New Assignment

Copy and Paste Your Assignment Here