Answer To: Assignment 4 - Spark ML. Learning Outcomes: In this assignment you will: · Use ML pipelines · Improve...
Abr Writing answered on Mar 08 2021
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "ykBV4nGNkZcQ"
},
"source": [
"Importing the necessary libraries"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ywM9xDqWkZM1"
},
"outputs": [],
"source": [
"from pyspark import SparkConf\n",
"from pyspark import SparkContext\n",
"from pyspark.sql import SQLContext\n",
"\n",
"from pyspark.ml.feature import *\n",
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.regression import *\n",
"from pyspark.ml.evaluation import *\n",
"from pyspark.ml.evaluation import *\n",
"from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n",
"\n",
"# Initializing Spark.\n",
"# getOrCreate reuses an already-running SparkContext, so re-running this cell\n",
"# (or running it where a context already exists) does not fail with\n",
"# 'Cannot run multiple SparkContexts at once'.\n",
"sc = SparkContext.getOrCreate(SparkConf().setAppName('Diamonds'))\n",
"sqlContext = SQLContext(sc)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "sy6yx7k0jhL_"
},
"source": [
"# Question 1"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "4z1R7c0ek9Ao"
},
"source": [
"Loading the diamonds dataset as a Spark DataFrame and displaying the top five rows"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 124
},
"colab_type": "code",
"id": "QUK06JwhjgsA",
"outputId": "dc37c924-6641-490f-8217-32ded68a6654"
},
"outputs": [
{
"data": {
"text/plain": [
"[Row(carat=0.23, cut='Ideal', color='E', clarity='SI2', depth=61.5, table=55.0, price=326, x=3.95, y=3.98, z=2.43),\n",
" Row(carat=0.21, cut='Premium', color='E', clarity='SI1', depth=59.8, table=61.0, price=326, x=3.89, y=3.84, z=2.31),\n",
" Row(carat=0.23, cut='Good', color='E', clarity='VS1', depth=56.9, table=65.0, price=327, x=4.05, y=4.07, z=2.31),\n",
" Row(carat=0.29, cut='Premium', color='I', clarity='VS2', depth=62.4, table=58.0, price=334, x=4.2, y=4.23, z=2.63),\n",
" Row(carat=0.31, cut='Good', color='J', clarity='SI2', depth=63.3, table=58.0, price=335, x=4.34, y=4.35, z=2.75)]"
]
},
"execution_count": 5,
"metadata": {
"tags": []
},
"output_type": "execute_result"
}
],
"source": [
"# Read the CSV with Spark's built-in reader: the first line is treated as the\n",
"# header and column types are inferred from the data.\n",
"diamonds = sqlContext.read.csv('diamonds.csv', header=True, inferSchema=True)\n",
"\n",
"# Preview the first five rows (last expression, so the cell displays it).\n",
"diamonds.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "hhd41ewHzR7L"
},
"source": [
"Printing the data schema"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 225
},
"colab_type": "code",
"id": "QyhE3MPJzSbP",
"outputId": "985a0884-63c4-434d-80c5-1d57d4e53623"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- carat: double (nullable = true)\n",
" |-- cut: string (nullable = true)\n",
" |-- color: string (nullable = true)\n",
" |-- clarity: string (nullable = true)\n",
" |-- depth: double (nullable = true)\n",
" |-- table: double (nullable = true)\n",
" |-- price: integer (nullable = true)\n",
" |-- x: double (nullable = true)\n",
" |-- y: double (nullable = true)\n",
" |-- z: double (nullable = true)\n",
"\n"
]
}
],
"source": [
"diamonds.printSchema()"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "VWjwvYxl6mp0"
},
"source": [
"## Random Forest Regressor"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"colab_type": "code",
"id": "BKpCpFzE7yaH",
"outputId": "75ce2836-10b9-48be-99ed-0d5652da7a79"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3 categorical features\n",
"6 numerical features\n"
]
}
],
"source": [
"cat_cols = [item[0] for item in diamonds.dtypes if item[1].startswith('string')] \n",
"print(str(len(cat_cols)) + ' categorical features')\n",
"num_cols = [item[0] for item in diamonds.dtypes if...