Answer To: AIT 580 – Assignment 13 – Data Analysis using Spark and Jupyter Please submit on Blackboard Tool:...
Ximi answered on Jul 29 2021
assignment-13-spark-1-ai1ihxok (1).ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"wget -q https://repo.vse.gmu.edu/DataScience/apache.access.log"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sys\n",
"import re\n",
"import datetime\n",
"import findspark\n",
"findspark.init('/opt/spark')\n",
"from pyspark.sql import functions as F"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from pyspark import SparkConf, SparkContext\n",
"from pyspark.sql import SQLContext\n",
"sc = SparkContext()\n",
"#sc.stop()\n",
"sqlContext = SQLContext(sc)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"data = sqlContext.read.text('apache.access.log')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1043177\n"
]
}
],
"source": [
"print(data.count())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- value: string (nullable = true)\n",
"\n",
"None\n"
]
}
],
"source": [
"print(data.printSchema())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------------------------------------------------------------------------------------------------------------+\n",
"|value |\n",
"+--------------------------------------------------------------------------------------------------------------------------+\n",
"|in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] \"GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0\" 200 1839|\n",
"|uplherc.upl.com - - [01/Aug/1995:00:00:07 -0400] \"GET / HTTP/1.0\" 304 0 |\n",
"|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] \"GET /images/ksclogo-medium.gif HTTP/1.0\" 304 0 |\n",
"|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] \"GET /images/MOSAIC-logosmall.gif HTTP/1.0\" 304 0 |\n",
"|uplherc.upl.com - - [01/Aug/1995:00:00:08 -0400] \"GET /images/USA-logosmall.gif HTTP/1.0\" 304 0 |\n",
"|ix-esc-ca2-07.ix.netcom.com - - [01/Aug/1995:00:00:09 -0400] \"GET /images/launch-logo.gif HTTP/1.0\" 200 1713 |\n",
"|uplherc.upl.com - - [01/Aug/1995:00:00:10 -0400] \"GET /images/WORLD-logosmall.gif HTTP/1.0\" 304 0 |\n",
"+--------------------------------------------------------------------------------------------------------------------------+\n",
"only showing top 7 rows\n",
"\n"
]
}
],
"source": [
"data.show(n=7,truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+--------------------------+-----------------------------------------------+------+------------+\n",
"|host |timestamp |path |status|content_size|\n",
"+------------------+--------------------------+-----------------------------------------------+------+------------+\n",
"|in24.inetnebr.com |01/Aug/1995:00:00:01 -0400|/shuttle/missions/sts-68/news/sts-68-mcc-05.txt|200 |1839 |\n",
"|uplherc.upl.com |01/Aug/1995:00:00:07 -0400|/ |304 |0 |\n",
"|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/ksclogo-medium.gif |304 |0 |\n",
"|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/MOSAIC-logosmall.gif |304 |0 |\n",
"|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/USA-logosmall.gif |304 |0 |\n",
"+------------------+--------------------------+-----------------------------------------------+------+------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"split_df = data.select(\n",
" # \\s = whitespace char, \\d = digit char [0-9], \\w = word char\n",
" # 'host' field: ([^\\s]+\\s) means take group who DOESN'T begin with whitespace char, and regex stop when it encounters \\s\n",
" F.regexp_extract('value', r'^([^\\s]+\\s)', 1).alias('host'),\n",
" # 'timestamp' field: capture group whose enclosed by bar bracket [] - parenthesis doesn't cover the bar-brack cuz you just want the timestamp.\n",
" # it goes like: \"2-dig/3-alpha/4-dig/2dig:2dig:2dig: -3dig\"\n",
" F.regexp_extract('value', r'^.*\\[(\\d{2}/\\w{3}/\\d{4}:\\d{2}:\\d{2}:\\d{2} -\\d{4})]', 1).alias('timestamp'),\n",
" # 'path' field: ^.*\" = take any char until you hit the double-quote char. \\w+\\s = http request method.\n",
" # Finally, ([^\\s]+)\\s+HTTP = keep extracing all non-whitespace char until you bump into \\s followed up HTTP\n",
" F.regexp_extract('value', r'^.*\"\\w+\\s+([^\\s]+)\\s+HTTP.*\"', 1).alias('path'),\n",
" # 'status' field: http://www.w3schools.com/tags/ref_httpmessages.asp\n",
" F.regexp_extract('value', r'^.*\"\\s+([^\\s]+)', 1).cast('integer').alias('status'),\n",
" # 'content_size' field: the ending series of digits\n",
" F.regexp_extract('value', r'^.*\\s+(\\d+)$', 1).cast('integer').alias('content_size'))\n",
"split_df.show(n=5,truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"cleaned_df = split_df.na.fill({'content_size': 0})"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"month_map = {\n",
" 'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,\n",
" 'Aug':8, 'Sep': 9, 'Oct':10, 'Nov': 11, 'Dec': 12\n",
"}\n",
"\n",
"def parse_clf_time(s):\n",
" \"\"\" Convert Common Log time format into a Python datetime object\n",
" Args:\n",
" s (str): date and time in Apache time format [dd/mmm/yyyy:hh:mm:ss (+/-)zzzz]\n",
" Returns:\n",
" a string suitable for passing to CAST('timestamp')\n",
" \"\"\"\n",
" # NOTE: We're ignoring time zone here. In a production application, you'd want to handle that.\n",
" return \"{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}\".format(\n",
" int(s[7:11]),\n",
" month_map[s[3:6]],\n",
" int(s[0:2]),\n",
" int(s[12:14]),\n",
" int(s[15:17]),\n",
" int(s[18:20])\n",
" )\n",
"\n",
"u_parse_time = F.udf(parse_clf_time)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Column\n"
]
}
],
"source": [
"col_to_append = (u_parse_time(cleaned_df['timestamp'])\n",
" .cast('timestamp') # convert column type. https://wtak23.github.io/pyspark/generated/generated/sql.Column.cast.html\n",
" .alias('time') # rename\n",
" )\n",
"print(col_to_append)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+--------------------------+-----------------------------------------------+------+------------+---------------------+\n",
"|host |timestamp |path |status|content_size|time |\n",
"+------------------+--------------------------+-----------------------------------------------+------+------------+---------------------+\n",
"|in24.inetnebr.com |01/Aug/1995:00:00:01 -0400|/shuttle/missions/sts-68/news/sts-68-mcc-05.txt|200 |1839 |1995-08-01 00:00:01.0|\n",
"|uplherc.upl.com |01/Aug/1995:00:00:07 -0400|/ |304 |0 |1995-08-01 00:00:07.0|\n",
"|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/ksclogo-medium.gif |304 |0 |1995-08-01 00:00:08.0|\n",
"|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/MOSAIC-logosmall.gif |304 |0 |1995-08-01 00:00:08.0|\n",
"|uplherc.upl.com |01/Aug/1995:00:00:08 -0400|/images/USA-logosmall.gif |304 |0 |1995-08-01 00:00:08.0|\n",
"+------------------+--------------------------+-----------------------------------------------+------+------------+---------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"# now append column to our parsed, cleaned dataframe\n",
"logs_df = cleaned_df.select('*', col_to_append)\n",
"logs_df.show(n=5,truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------+-----------------------------------------------+------+------------+---------------------+\n",
"|host |path |status|content_size|time |\n",
"+------------------+-----------------------------------------------+------+------------+---------------------+\n",
"|in24.inetnebr.com |/shuttle/missions/sts-68/news/sts-68-mcc-05.txt|200 |1839 |1995-08-01 00:00:01.0|\n",
"|uplherc.upl.com |/ |304 |0 |1995-08-01 00:00:07.0|\n",
"|uplherc.upl.com |/images/ksclogo-medium.gif |304 |0 |1995-08-01 00:00:08.0|\n",
"|uplherc.upl.com |/images/MOSAIC-logosmall.gif |304 |0 |1995-08-01 00:00:08.0|\n",
"|uplherc.upl.com |/images/USA-logosmall.gif |304 |0 |1995-08-01 00:00:08.0|\n",
"+------------------+-----------------------------------------------+------+------------+---------------------+\n",
"only showing top 5 rows\n",
"\n"
]
}
],
"source": [
"logs_df = logs_df.drop('timestamp')\n",
"logs_df.show(n=5,truncate=False)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1043177\n"
]
}
],
"source": [
"total_log_entries = logs_df.count()\n",
"print(total_log_entries)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- host: string (nullable = true)\n",
" |-- path: string (nullable = true)\n",
" |-- status: integer (nullable = true)\n",
" |-- content_size: integer (nullable = false)\n",
" |-- time: timestamp (nullable = true)\n",
"\n"
]
}
],
"source": [
"logs_df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+--------------------+------+------------+--------------------+\n",
"| host| path|status|content_size| time|\n",
"+--------------------+--------------------+------+------------+--------------------+\n",
"| in24.inetnebr.com |/shuttle/missions...| 200| 1839|1995-08-01 00:00:...|\n",
"| uplherc.upl.com | /| 304| 0|1995-08-01 00:00:...|\n",
"| uplherc.upl.com |/images/ksclogo-m...| 304| 0|1995-08-01 00:00:...|\n",
"| uplherc.upl.com |/images/MOSAIC-lo...| 304| 0|1995-08-01 00:00:...|\n",
"| uplherc.upl.com |/images/USA-logos...| 304| 0|1995-08-01 00:00:...|\n",
"|ix-esc-ca2-07.ix....|/images/launch-lo...| 200| 1713|1995-08-01 00:00:...|\n",
"| uplherc.upl.com |/images/WORLD-log...| 304| 0|1995-08-01 00:00:...|\n",
"|slppp6.intermind....|/history/skylab/s...| 200| 1687|1995-08-01 00:00:...|\n",
"|piweba4y.prodigy....|/images/launchmed...| 200| 11853|1995-08-01 00:00:...|\n",
"|slppp6.intermind....|/history/skylab/s...| 200| 9202|1995-08-01 00:00:...|\n",
"|slppp6.intermind....|/images/ksclogosm...| 200| 3635|1995-08-01 00:00:...|\n",
"|ix-esc-ca2-07.ix....|/history/apollo/i...| 200| 1173|1995-08-01 00:00:...|\n",
"|slppp6.intermind....|/history/apollo/i...| 200| 3047|1995-08-01 00:00:...|\n",
"| uplherc.upl.com |/images/NASA-logo...| 304| 0|1995-08-01 00:00:...|\n",
"| 133.43.96.45 |/shuttle/missions...| 200| 10566|1995-08-01 00:00:...|\n",
"|kgtyk4.kj.yamagat...| /| 200| 7280|1995-08-01 00:00:...|\n",
"|kgtyk4.kj.yamagat...|/images/ksclogo-m...| 200| 5866|1995-08-01 00:00:...|\n",
"| d0ucr6.fnal.gov |/history/apollo/a...| 200| 2743|1995-08-01 00:00:...|\n",
"|ix-esc-ca2-07.ix....|/shuttle/resource...| 200| 6849|1995-08-01 00:00:...|\n",
"| d0ucr6.fnal.gov |/history/apollo/a...| 200| 14897|1995-08-01 00:00:...|\n",
"+--------------------+--------------------+------+------------+--------------------+\n",
"only showing top 20 rows\n",
"\n"
]
}
],
"source": [
"logs_df.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Write your code below. \n",
"##### Few hints\n",
"##### 1. Use logs_df to answer the questions.\n",
"##### 2. For visualization, convert your data to Pasdas DataFrame first.\n",
"##### -----"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Q1: Show the content_size Statistics using describe method (5 points)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+------------------+\n",
"|summary| content_size|\n",
"+-------+------------------+\n",
"| count| 1043177|\n",
"| mean|17531.555702435926|\n",
"| stddev| 68561.9990626412|\n",
"| min| 0|\n",
"| max| 3421948|\n",
"+-------+------------------+\n",
"\n"
]
}
],
"source": [
"logs_df.describe('content_size').show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Q2: Count each status (hint: Use groupby and count). Also sort the output by status...