Answer To: Apache Spark: Fit a Binary Logistic Regression Model to a Dataset. Dataset: Dropbox link for...
Ximi answered on Nov 03 2021
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "spark-grey.ipynb",
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "YdjxUFOKgbPX",
"colab_type": "code",
"colab": {}
},
"source": [
"# Environment bootstrap: install Java 8, fetch and unpack Spark 2.4.4 (Hadoop 2.7 build), install findspark.\n",
"import pandas\n",
"!apt-get install openjdk-8-jdk-headless -qq > /dev/null\n",
"# Superseded Spark releases are removed from the Apache mirrors and kept only on\n",
"# archive.apache.org, so download from there (over HTTPS).\n",
"# -nc skips the download when the tarball already exists, so re-running the cell\n",
"# does not leave duplicate '.tgz.1' copies behind.\n",
"!wget -q -nc https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz\n",
"# Extract without -v: listing every extracted file floods the cell output.\n",
"!tar xf spark-2.4.4-bin-hadoop2.7.tgz\n",
"!pip install -q findspark"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "TDbXbmKege-K",
"colab_type": "code",
"colab": {}
},
"source": [
"import os\n",
"\n",
"# Export JAVA_HOME and SPARK_HOME for the current kernel process.\n",
"os.environ.update({\n",
"    \"JAVA_HOME\": \"/usr/lib/jvm/java-8-openjdk-amd64\",\n",
"    \"SPARK_HOME\": \"/content/spark-2.4.4-bin-hadoop2.7\",\n",
"})"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "g9YMz0J4g0fS",
"colab_type": "code",
"colab": {}
},
"source": [
"import findspark\n",
"\n",
"# Locate the Spark installation and make pyspark importable from this kernel.\n",
"findspark.init()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "SXFqUHmxoK0e",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyspark.sql import SparkSession\n",
"\n",
"# Get the active session, or create one with master set to local[*].\n",
"spark = (\n",
"    SparkSession.builder\n",
"    .master(\"local[*]\")\n",
"    .getOrCreate()\n",
")"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Ed_uGjoWoZ58",
"colab_type": "code",
"colab": {}
},
"source": [
"from pyspark.sql import SQLContext\n",
"\n",
"# Take the SparkContext from the session and wrap it in an SQLContext.\n",
"sc = spark.sparkContext\n",
"sqlContext = SQLContext(sc)\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "qMQhqdhHobWW",
"colab_type": "code",
"colab": {}
},
"source": [
"import glob\n",
"\n",
"# List all *.parquet files in the current working directory.\n",
"# glob.glob returns matches in arbitrary, filesystem-dependent order, so sort\n",
"# them to keep the file list identical across runs and machines.\n",
"files = sorted(glob.glob('*.parquet'))"
],
"execution_count": 0,
"outputs": []
},
{
...