{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "1Bs1innK5QPb"
},
"source": [
"# Chapter 9 - Data Science"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#To change date format\n",
"#Amercian date format is mm/dd/yyyy. I have changed it.\n",
"#I have made the negative campaign duration to postitive.\n",
"#And I have added the day of the weeks."
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "5pNpI9xW5QPc"
},
"source": [
"## 0 - Setting up the notebook"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 406
},
"colab_type": "code",
"id": "Mp3qnAoS5QPe",
"outputId": "5ac5685b-b5ff-4635-dcbb-7986b1e3d1cb"
},
"outputs": [],
"source": [
"#!pip install faker\n",
"#!pip install delorean\n",
"\n",
"import json\n",
"import calendar\n",
"import random\n",
"from datetime import date, timedelta, datetime\n",
"import faker\n",
"import numpy as np\n",
"from pandas import DataFrame\n",
"from delorean import parse\n",
"import pandas as pd\n",
"\n",
"# make the graphs nicer\n",
"import matplotlib.pyplot as plt\n",
"plt.style.use('ggplot')\n",
"# see all available with: print(plt.style.available)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "wegaBXNB5QPg"
},
"source": [
"## 1 - Preparing the Data"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "s_DxNSf55QPh"
},
"outputs": [],
"source": [
"# create the faker to populate the data\n",
"fake = faker.Faker()"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "irfx4AJr5QPj"
},
"outputs": [],
"source": [
"usernames = set()\n",
"usernames_no = 1000\n",
"\n",
"# populate the set with 1000 unique usernames\n",
"while len(usernames) < usernames_no:\n",
" usernames.add(fake.user_name())"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 87
},
"colab_type": "code",
"id": "_dtbclPG5QPm",
"outputId": "6315de4b-14a2-4927-ae17-b6db51390ff9"
},
"outputs": [
{
"data": {
"text/plain": [
"['{\"username\": \"ashley97\", \"name\": \"Miranda Cobb\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}',\n",
" '{\"username\": \"lstewart\", \"name\": \"Julie Harris\", \"gender\": \"F\", \"email\": \"[email protected]\", \"age\": 83, \"address\": \"83858 Jones Streets Suite 212\\\\nSarahburgh, SD 84019\"}',\n",
" '{\"username\": \"sharon85\", \"name\": \"Christopher Blevins\", \"gender\": \"M\", \"email\": \"[email protected]\", \"age\": 62, \"address\": \"1266 Alice Pike\\\\nNew Jay, MI 30538\"}']"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def get_random_name_and_gender():\n",
" skew = .6 # 60% of users will be female\n",
" male = random.random() > skew\n",
" if male:\n",
" return fake.name_male(), 'M'\n",
" else:\n",
" return fake.name_female(), 'F'\n",
"\n",
"# for each username, create a complete user profile\n",
"# simulate user data coming from an API. It is a list\n",
"# of JSON strings (users).\n",
"def get_users(usernames):\n",
" users = []\n",
" for username in usernames:\n",
" name, gender = get_random_name_and_gender()\n",
" user = {\n",
" 'username': username,\n",
" 'name': name,\n",
" 'gender': gender,\n",
" 'email': fake.email(),\n",
" 'age': fake.random_int(min=18, max=90),\n",
" 'address': fake.address(),\n",
" }\n",
" users.append(json.dumps(user))\n",
" return users\n",
"\n",
"users = get_users(usernames)\n",
"users[:3]"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "3cEb9cLG5QPo"
},
"outputs": [],
"source": [
"# campaign name format:\n",
"# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency\n",
"def get_type():\n",
" # just some gibberish internal codes\n",
" types = ['AKX', 'BYU', 'GRZ', 'KTR']\n",
" return random.choice(types)\n",
"\n",
"def get_start_end_dates():\n",
" duration = random.randint(1, 2 * 365)\n",
" offset = random.randint(-365, 365)\n",
" start = date.today() - timedelta(days=offset)\n",
" end = start + timedelta(days=duration)\n",
" \n",
" def _format_date(date_):\n",
" return date_.strftime(\"%Y%m%d\") \n",
" \n",
" return _format_date(start), _format_date(end)\n",
"\n",
"def get_age():\n",
" age = random.randint(20, 45)\n",
" age -= age % 5\n",
" diff = random.randint(5, 25)\n",
" diff -= diff % 5\n",
" return '{}-{}'.format(age, age + diff)\n",
"\n",
"def get_gender():\n",
" return random.choice(('M', 'F', 'B'))\n",
"\n",
"def get_currency():\n",
" return random.choice(('GBP', 'EUR', 'USD'))\n",
"\n",
"def get_campaign_name():\n",
" separator = '_'\n",
" type_ = get_type()\n",
" start_end = separator.join(get_start_end_dates())\n",
" age = get_age()\n",
" gender = get_gender()\n",
" currency = get_currency()\n",
" return separator.join(\n",
" (type_, start_end, age, gender, currency))"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "ehc8mWDw5QPq"
},
"outputs": [],
"source": [
"# campaign data:\n",
"# name, budget, spent, clicks, impressions\n",
"def get_campaign_data():\n",
" name = get_campaign_name()\n",
" budget = random.randint(10**3, 10**6)\n",
" spent = random.randint(10**2, budget) \n",
" clicks = int(random.triangular(10**2, 10**5, 0.2 * 10**5)) \n",
" impressions = int(random.gauss(0.5 * 10**6, 2))\n",
" return {\n",
" 'cmp_name': name,\n",
" 'cmp_bgt': budget,\n",
" 'cmp_spent': spent,\n",
" 'cmp_clicks': clicks,\n",
" 'cmp_impr': impressions\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "FvtOrJPw5QPs"
},
"outputs": [],
"source": [
"# assemble the logic to get the final version of the rough data\n",
"# data will be a list of dictionaries. Each dictionary will follow\n",
"# this structure:\n",
"# {'user': user_json, 'campaigns': [c1, c2, ...]}\n",
"# where user_json is the JSON string version of a user data dict\n",
"# and c1, c2, ... are campaign dicts as returned by\n",
"# get_campaign_data\n",
"\n",
"def get_data(users):\n",
" data = []\n",
" for user in users:\n",
" campaigns = [get_campaign_data()\n",
" for _ in range(random.randint(2, 8))]\n",
" data.append({'user': user, 'campaigns': campaigns})\n",
" return data"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "Ux9VN0Q15QPu"
},
"source": [
"## 2 - Cleaning the data"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 994
},
"colab_type": "code",
"id": "7kxPpJCh5QPv",
"outputId": "d0e63299-c882-42ea-9b30-23a6e8b5a63e"
},
"outputs": [
{
"data": {
"text/plain": [
"[{'user': '{\"username\": \"ashley97\", \"name\": \"Miranda Cobb\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}',\n",
" 'campaigns': [{'cmp_name': 'KTR_20191001_20210104_30-45_B_EUR',\n",
" 'cmp_bgt': 542731,\n",
" 'cmp_spent': 320061,\n",
" 'cmp_clicks': 56556,\n",
" 'cmp_impr': 499998},\n",
" {'cmp_name': 'KTR_20181119_20190516_40-60_B_EUR',\n",
" 'cmp_bgt': 563162,\n",
" 'cmp_spent': 294891,\n",
" 'cmp_clicks': 66268,\n",
" 'cmp_impr': 500001}]},\n",
" {'user': '{\"username\": \"lstewart\", \"name\": \"Julie Harris\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 83, \"address\": \"83858 Jones Streets Suite 212\\\\nSarahburgh, SD 84019\"}',\n",
" 'campaigns': [{'cmp_name': 'KTR_20190603_20200807_35-45_M_EUR',\n",
" 'cmp_bgt': 354212,\n",
" 'cmp_spent': 30657,\n",
" 'cmp_clicks': 29316,\n",
" 'cmp_impr': 500001},\n",
" {'cmp_name': 'GRZ_20191119_20210913_40-55_B_GBP',\n",
" 'cmp_bgt': 994364,\n",
" 'cmp_spent': 122783,\n",
" 'cmp_clicks': 63457,\n",
" 'cmp_impr': 499999}]}]"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fetch simulated rough data\n",
"rough_data = get_data(users)\n",
"\n",
"rough_data[:2] # let's take a peek"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"colab_type": "code",
"id": "Bf6rqL-d5QPx",
"outputId": "9c0710be-c3e8-4f5b-c8c9-1df0508396ed"
},
"outputs": [
{
"data": {
"text/plain": [
"[{'cmp_name': 'KTR_20191001_20210104_30-45_B_EUR',\n",
" 'cmp_bgt': 542731,\n",
" 'cmp_spent': 320061,\n",
" 'cmp_clicks': 56556,\n",
" 'cmp_impr': 499998,\n",
" 'user': '{\"username\": \"ashley97\", \"name\": \"Miranda Cobb\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}'},\n",
" {'cmp_name': 'KTR_20181119_20190516_40-60_B_EUR',\n",
" 'cmp_bgt': 563162,\n",
" 'cmp_spent': 294891,\n",
" 'cmp_clicks': 66268,\n",
" 'cmp_impr': 500001,\n",
" 'user': '{\"username\": \"ashley97\", \"name\": \"Miranda Cobb\", \"gender\": \"F\", \"email\": \"
[email protected]\", \"age\": 35, \"address\": \"00037 Bowers Mall Apt. 791\\\\nHuangfurt, CA 81423\"}'}]"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Let's start from having a different version of the data\n",
"# I want a list whose items will be dicts. Each dict is \n",
"# the original campaign dict plus the user JSON\n",
"\n",
"data = []\n",
"for datum in rough_data:\n",
" for campaign in datum['campaigns']:\n",
" campaign.update({'user': datum['user']})\n",
" data.append(campaign)\n",
"data[:2] # let's take another peek"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "25HN96S65QP0"
},
"source": [
"### Creating the DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
},
"colab_type": "code",
"id": "g-te8Uy85QP1",
"outputId": "82764019-7e45-4384-e30a-136e3468b786"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"cmp_bgt | \n",
"cmp_clicks | \n",
"cmp_impr | \n",
"cmp_name | \n",
"cmp_spent | \n",
"user | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"542731 | \n",
"56556 | \n",
"499998 | \n",
"KTR_20191001_20210104_30-45_B_EUR | \n",
"320061 | \n",
"{\"username\": \"ashley97\", \"name\": \"Miranda Cobb... | \n",
"
\n",
"\n",
"1 | \n",
"563162 | \n",
"66268 | \n",
"500001 | \n",
"KTR_20181119_20190516_40-60_B_EUR | \n",
"294891 | \n",
"{\"username\": \"ashley97\", \"name\": \"Miranda Cobb... | \n",
"
\n",
"\n",
"2 | \n",
"354212 | \n",
"29316 | \n",
"500001 | \n",
"KTR_20190603_20200807_35-45_M_EUR | \n",
"30657 | \n",
"{\"username\": \"lstewart\", \"name\": \"Julie Harris... | \n",
"
\n",
"\n",
"3 | \n",
"994364 | \n",
"63457 | \n",
"499999 | \n",
"GRZ_20191119_20210913_40-55_B_GBP | \n",
"122783 | \n",
"{\"username\": \"lstewart\", \"name\": \"Julie Harris... | \n",
"
\n",
"\n",
"4 | \n",
"205775 | \n",
"14040 | \n",
"499997 | \n",
"KTR_20200330_20211027_20-40_F_GBP | \n",
"42387 | \n",
"{\"username\": \"sharon85\", \"name\": \"Christopher ... | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" cmp_bgt cmp_clicks cmp_impr cmp_name \\\n",
"0 542731 56556 499998 KTR_20191001_20210104_30-45_B_EUR \n",
"1 563162 66268 500001 KTR_20181119_20190516_40-60_B_EUR \n",
"2 354212 29316 500001 KTR_20190603_20200807_35-45_M_EUR \n",
"3 994364 63457 499999 GRZ_20191119_20210913_40-55_B_GBP \n",
"4 205775 14040 499997 KTR_20200330_20211027_20-40_F_GBP \n",
"\n",
" cmp_spent user \n",
"0 320061 {\"username\": \"ashley97\", \"name\": \"Miranda Cobb... \n",
"1 294891 {\"username\": \"ashley97\", \"name\": \"Miranda Cobb... \n",
"2 30657 {\"username\": \"lstewart\", \"name\": \"Julie Harris... \n",
"3 122783 {\"username\": \"lstewart\", \"name\": \"Julie Harris... \n",
"4 42387 {\"username\": \"sharon85\", \"name\": \"Christopher ... "
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# data is good enough for creating an initial DataFrame\n",
"df = DataFrame(data)\n",
"\n",
"# let's take a peek at the first 5 rows, to make sure\n",
"# nothing weird has happened\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 134
},
"colab_type": "code",
"id": "NXdilDku5QP6",
"outputId": "f66cb3d5-c03c-408c-d51a-fc55e10cfece"
},
"outputs": [
{
"data": {
"text/plain": [
"cmp_bgt 4920\n",
"cmp_clicks 4920\n",
"cmp_impr 4920\n",
"cmp_name 4920\n",
"cmp_spent 4920\n",
"user 4920\n",
"dtype: int64"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# OK! DataFrame is alive and well!\n",
"# let's get a sense of how many rows there are and what is\n",
"# their structure.\n",
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 284
},
"colab_type": "code",
"id": "1ag3HSMI5QP8",
"outputId": "fb213f9c-fd7e-491f-8f6f-f3ca4f094926"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"cmp_bgt | \n",
"cmp_clicks | \n",
"cmp_impr | \n",
"cmp_spent | \n",
"
\n",
"\n",
"\n",
"\n",
"count | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"
\n",
"\n",
"mean | \n",
"504930.672358 | \n",
"40222.850203 | \n",
"499999.458943 | \n",
"250462.501016 | \n",
"
\n",
"\n",
"std | \n",
"286093.538484 | \n",
"21672.391822 | \n",
"2.026843 | \n",
"218451.424805 | \n",
"
\n",
"\n",
"min | \n",
"1322.000000 | \n",
"701.000000 | \n",
"499992.000000 | \n",
"143.000000 | \n",
"
\n",
"\n",
"25% | \n",
"255732.500000 | \n",
"22224.500000 | \n",
"499998.000000 | \n",
"70305.500000 | \n",
"
\n",
"\n",
"50% | \n",
"504126.000000 | \n",
"36854.500000 | \n",
"499999.000000 | \n",
"189206.000000 | \n",
"
\n",
"\n",
"75% | \n",
"753168.250000 | \n",
"55770.250000 | \n",
"500001.000000 | \n",
"386425.750000 | \n",
"
\n",
"\n",
"max | \n",
"999870.000000 | \n",
"98858.000000 | \n",
"500007.000000 | \n",
"981524.000000 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" cmp_bgt cmp_clicks cmp_impr cmp_spent\n",
"count 4920.000000 4920.000000 4920.000000 4920.000000\n",
"mean 504930.672358 40222.850203 499999.458943 250462.501016\n",
"std 286093.538484 21672.391822 2.026843 218451.424805\n",
"min 1322.000000 701.000000 499992.000000 143.000000\n",
"25% 255732.500000 22224.500000 499998.000000 70305.500000\n",
"50% 504126.000000 36854.500000 499999.000000 189206.000000\n",
"75% 753168.250000 55770.250000 500001.000000 386425.750000\n",
"max 999870.000000 98858.000000 500007.000000 981524.000000"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"id": "7DcOF6xY5QP_",
"outputId": "adc4952e-453b-431c-8f71-bb9eab346bb5"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"cmp_bgt | \n",
"cmp_clicks | \n",
"cmp_impr | \n",
"cmp_name | \n",
"cmp_spent | \n",
"user | \n",
"
\n",
"\n",
"\n",
"\n",
"2922 | \n",
"999870 | \n",
"40078 | \n",
"500000 | \n",
"KTR_20180522_20181120_25-40_M_GBP | \n",
"76771 | \n",
"{\"username\": \"christine59\", \"name\": \"Stephanie... | \n",
"
\n",
"\n",
"1765 | \n",
"999775 | \n",
"34657 | \n",
"499998 | \n",
"BYU_20200125_20201213_30-35_F_USD | \n",
"571672 | \n",
"{\"username\": \"yhowell\", \"name\": \"Jonathan Loga... | \n",
"
\n",
"\n",
"178 | \n",
"999756 | \n",
"36870 | \n",
"500000 | \n",
"AKX_20180630_20190615_20-25_B_EUR | \n",
"262324 | \n",
"{\"username\": \"scott93\", \"name\": \"Nicholas Vill... | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" cmp_bgt cmp_clicks cmp_impr cmp_name \\\n",
"2922 999870 40078 500000 KTR_20180522_20181120_25-40_M_GBP \n",
"1765 999775 34657 499998 BYU_20200125_20201213_30-35_F_USD \n",
"178 999756 36870 500000 AKX_20180630_20190615_20-25_B_EUR \n",
"\n",
" cmp_spent user \n",
"2922 76771 {\"username\": \"christine59\", \"name\": \"Stephanie... \n",
"1765 571672 {\"username\": \"yhowell\", \"name\": \"Jonathan Loga... \n",
"178 262324 {\"username\": \"scott93\", \"name\": \"Nicholas Vill... "
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# let's see which are the top and bottom 3 campaigns according\n",
"# to budget (regardless of the currency)\n",
"df.sort_values(by=['cmp_bgt'], ascending=False).head(3)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"id": "h9WFkFnz5QQC",
"outputId": "5e366b8d-50a9-4ce3-a5b4-ff5ff272a0dc"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"cmp_bgt | \n",
"cmp_clicks | \n",
"cmp_impr | \n",
"cmp_name | \n",
"cmp_spent | \n",
"user | \n",
"
\n",
"\n",
"\n",
"\n",
"4230 | \n",
"1755 | \n",
"44369 | \n",
"500002 | \n",
"GRZ_20190518_20210124_30-40_M_USD | \n",
"1430 | \n",
"{\"username\": \"mejiachristine\", \"name\": \"Megan ... | \n",
"
\n",
"\n",
"2321 | \n",
"1607 | \n",
"57234 | \n",
"500000 | \n",
"AKX_20190824_20200218_35-45_B_EUR | \n",
"1192 | \n",
"{\"username\": \"zreynolds\", \"name\": \"Michael Mor... | \n",
"
\n",
"\n",
"1433 | \n",
"1322 | \n",
"11909 | \n",
"500002 | \n",
"GRZ_20181012_20200705_30-45_F_GBP | \n",
"243 | \n",
"{\"username\": \"brownbrooke\", \"name\": \"Michelle ... | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" cmp_bgt cmp_clicks cmp_impr cmp_name \\\n",
"4230 1755 44369 500002 GRZ_20190518_20210124_30-40_M_USD \n",
"2321 1607 57234 500000 AKX_20190824_20200218_35-45_B_EUR \n",
"1433 1322 11909 500002 GRZ_20181012_20200705_30-45_F_GBP \n",
"\n",
" cmp_spent user \n",
"4230 1430 {\"username\": \"mejiachristine\", \"name\": \"Megan ... \n",
"2321 1192 {\"username\": \"zreynolds\", \"name\": \"Michael Mor... \n",
"1433 243 {\"username\": \"brownbrooke\", \"name\": \"Michelle ... "
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sort_values(by=['cmp_bgt'], ascending=False).tail(3)"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "TcdxS56G5QQE"
},
"source": [
"## 3 - Manipulating the DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"id": "dEr7VsTU5QQH",
"outputId": "b1cff3fd-e9b7-4667-df89-31049582d56d"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Type | \n",
"Start | \n",
"End | \n",
"Age | \n",
"Gender | \n",
"Currency | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"KTR | \n",
"01-10-2019 | \n",
"04-01-2021 | \n",
"30-45 | \n",
"B | \n",
"EUR | \n",
"
\n",
"\n",
"1 | \n",
"KTR | \n",
"11-19-2018 | \n",
"05-16-2019 | \n",
"40-60 | \n",
"B | \n",
"EUR | \n",
"
\n",
"\n",
"2 | \n",
"KTR | \n",
"03-06-2019 | \n",
"07-08-2020 | \n",
"35-45 | \n",
"M | \n",
"EUR | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Type Start End Age Gender Currency\n",
"0 KTR 01-10-2019 04-01-2021 30-45 B EUR\n",
"1 KTR 11-19-2018 05-16-2019 40-60 B EUR\n",
"2 KTR 03-06-2019 07-08-2020 35-45 M EUR"
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# first, let's explode cmp_name into its components\n",
"# and get a separate DataFrame for those\n",
"\n",
"def unpack_campaign_name(name):\n",
" # very optimistic method, assumes data in campaign name\n",
" # is always in good state\n",
" type_, start, end, age, gender, currency = name.split('_')\n",
" start = parse(start).date\n",
" end = parse(end).date\n",
" return type_, start, end, age, gender, currency\n",
"\n",
"campaign_data = df['cmp_name'].apply(unpack_campaign_name)\n",
"campaign_cols = [\n",
" 'Type', 'Start', 'End', 'Age', 'Gender', 'Currency']\n",
"campaign_df = DataFrame(\n",
" campaign_data.tolist(), columns=campaign_cols, index=df.index)\n",
"\n",
"#Change the date format of the Start and End to US format mm/dd/yyyy which is the US format.\n",
"#Here we have used strftime function which converts a time stamp to a desired date format that we want.\n",
"#It is in mm-dd-yyyy format.\n",
"#strftime('%m-%d-%Y') return a string. So date is stored as string now instead of date format.\n",
"\n",
"campaign_df['Start'] = campaign_df['Start'].apply(lambda x: x.strftime('%m-%d-%Y')) #Change date format to US format\n",
"campaign_df['End'] = campaign_df['End'].apply(lambda x: x.strftime('%m-%d-%Y')) #Change date format to US format\n",
"\n",
"campaign_df.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "T0wUrDcq5QQL"
},
"outputs": [],
"source": [
"# let's join the two dataframes\n",
"df = df.join(campaign_df)"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"id": "bdyO8URC5QQP",
"outputId": "59b375d8-91e6-497c-a17e-86952f0151d1"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"cmp_name | \n",
"Type | \n",
"Start | \n",
"End | \n",
"Age | \n",
"Gender | \n",
"Currency | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"KTR_20191001_20210104_30-45_B_EUR | \n",
"KTR | \n",
"01-10-2019 | \n",
"04-01-2021 | \n",
"30-45 | \n",
"B | \n",
"EUR | \n",
"
\n",
"\n",
"1 | \n",
"KTR_20181119_20190516_40-60_B_EUR | \n",
"KTR | \n",
"11-19-2018 | \n",
"05-16-2019 | \n",
"40-60 | \n",
"B | \n",
"EUR | \n",
"
\n",
"\n",
"2 | \n",
"KTR_20190603_20200807_35-45_M_EUR | \n",
"KTR | \n",
"03-06-2019 | \n",
"07-08-2020 | \n",
"35-45 | \n",
"M | \n",
"EUR | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" cmp_name Type Start End Age \\\n",
"0 KTR_20191001_20210104_30-45_B_EUR KTR 01-10-2019 04-01-2021 30-45 \n",
"1 KTR_20181119_20190516_40-60_B_EUR KTR 11-19-2018 05-16-2019 40-60 \n",
"2 KTR_20190603_20200807_35-45_M_EUR KTR 03-06-2019 07-08-2020 35-45 \n",
"\n",
" Gender Currency \n",
"0 B EUR \n",
"1 B EUR \n",
"2 M EUR "
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and take a peek: good! We didn't screw up!\n",
"df[['cmp_name'] + campaign_cols].head(3)"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "psb4jLFl5QQS"
},
"outputs": [],
"source": [
"# now let's do the same for the JSON user object\n",
"\n",
"def unpack_user_json(user):\n",
" # very optimistic as well, expects user objects\n",
" # to have all attributes\n",
" user = json.loads(user.strip())\n",
" return [\n",
" user['username'],\n",
" user['email'],\n",
" user['name'],\n",
" user['gender'],\n",
" user['age'],\n",
" user['address'],\n",
" ]\n",
"\n",
"user_data = df['user'].apply(unpack_user_json)\n",
"user_cols = [\n",
" 'username', 'email', 'name', 'gender', 'age', 'address']\n",
"user_df = DataFrame(\n",
" user_data.tolist(), columns=user_cols, index=df.index)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Qztu_l7j5QQV"
},
"outputs": [],
"source": [
"# let's join the two dataframes\n",
"df = df.join(user_df)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 106
},
"colab_type": "code",
"id": "aLndxgKp5QQZ",
"outputId": "41672907-6159-4554-cf71-ce140dfc0fa4"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"user | \n",
"username | \n",
"email | \n",
"name | \n",
"gender | \n",
"age | \n",
"address | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"{\"username\": \"ashley97\", \"name\": \"Miranda Cobb... | \n",
"ashley97 | \n",
"[email protected] | \n",
"Miranda Cobb | \n",
"F | \n",
"35 | \n",
"00037 Bowers Mall Apt. 791\\nHuangfurt, CA 81423 | \n",
"
\n",
"\n",
"1 | \n",
"{\"username\": \"ashley97\", \"name\": \"Miranda Cobb... | \n",
"ashley97 | \n",
"[email protected] | \n",
"Miranda Cobb | \n",
"F | \n",
"35 | \n",
"00037 Bowers Mall Apt. 791\\nHuangfurt, CA 81423 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" user username \\\n",
"0 {\"username\": \"ashley97\", \"name\": \"Miranda Cobb... ashley97 \n",
"1 {\"username\": \"ashley97\", \"name\": \"Miranda Cobb... ashley97 \n",
"\n",
" email name gender age \\\n",
"0
[email protected] Miranda Cobb F 35 \n",
"1
[email protected] Miranda Cobb F 35 \n",
"\n",
" address \n",
"0 00037 Bowers Mall Apt. 791\\nHuangfurt, CA 81423 \n",
"1 00037 Bowers Mall Apt. 791\\nHuangfurt, CA 81423 "
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# and take a peek: good! Also this time we didn't screw up!\n",
"df[['user'] + user_cols].head(2)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "RgZPZq7x5QQb"
},
"outputs": [],
"source": [
"# now we have the DataFrame completely expanded, so it's\n",
"# time to play with it. First, let's fix those ugly column names\n",
"better_columns = [\n",
" 'Budget', 'Clicks', 'Impressions',\n",
" 'cmp_name', 'Spent', 'user',\n",
" 'Type', 'Start', 'End',\n",
" 'Target Age', 'Target Gender', 'Currency',\n",
" 'Username', 'Email', 'Name',\n",
" 'Gender', 'Age', 'Address',\n",
"]\n",
"df.columns = better_columns"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Pj7kk6YZ5QQd"
},
"outputs": [],
"source": [
"# let's add three other columns\n",
"\n",
"def calculate_extra_columns(df):\n",
" # Click Through Rate\n",
" df['CTR'] = df['Clicks'] / df['Impressions']\n",
" # Cost Per Click\n",
" df['CPC'] = df['Spent'] / df['Clicks']\n",
" # Cost Per Impression\n",
" df['CPI'] = df['Spent'] / df['Impressions']\n",
" \n",
"calculate_extra_columns(df)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"id": "2kjY_m5g5QQf",
"outputId": "d9b93b70-9434-4c53-ff3c-36a1b443d0a0"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Spent | \n",
"Clicks | \n",
"Impressions | \n",
"CTR | \n",
"CPC | \n",
"CPI | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"320061 | \n",
"56556 | \n",
"499998 | \n",
"0.113112 | \n",
"5.659187 | \n",
"0.640125 | \n",
"
\n",
"\n",
"1 | \n",
"294891 | \n",
"66268 | \n",
"500001 | \n",
"0.132536 | \n",
"4.449976 | \n",
"0.589781 | \n",
"
\n",
"\n",
"2 | \n",
"30657 | \n",
"29316 | \n",
"500001 | \n",
"0.058632 | \n",
"1.045743 | \n",
"0.061314 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Spent Clicks Impressions CTR CPC CPI\n",
"0 320061 56556 499998 0.113112 5.659187 0.640125\n",
"1 294891 66268 500001 0.132536 4.449976 0.589781\n",
"2 30657 29316 500001 0.058632 1.045743 0.061314"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# let's take a peek\n",
"df[['Spent', 'Clicks', 'Impressions',\n",
" 'CTR', 'CPC', 'CPI']].head(3)"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 67
},
"colab_type": "code",
"id": "oz9Kzare5QQi",
"outputId": "4f68aba0-0982-480d-8ab5-8a557a86dd71"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CTR: 0.1131124524498098 0.1131124524498098\n",
"CPC: 5.659187354126883 5.659187354126883\n",
"CPI: 0.640124560498242 0.640124560498242\n"
]
}
],
"source": [
"# let's take the values of the first row and verify\n",
"clicks = df['Clicks'][0]\n",
"impressions = df['Impressions'][0]\n",
"spent = df['Spent'][0]\n",
"\n",
"CTR = df['CTR'][0]\n",
"CPC = df['CPC'][0]\n",
"CPI = df['CPI'][0]\n",
"\n",
"print('CTR:', CTR, clicks / impressions)\n",
"print('CPC:', CPC, spent / clicks)\n",
"print('CPI:', CPI, spent / impressions)"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "DA3dgME75QQl"
},
"outputs": [],
"source": [
"# let's also add the name of the Day when campaign starts\n",
"def get_day_of_the_week(day):\n",
" #To get day of week, first we will convert the date that we convert earlier to string with pd.to_datetime()\n",
" day = pd.to_datetime(day, format = '%m-%d-%Y') #Change to date format\n",
" number_to_day = dict(enumerate(calendar.day_name, 1)) #Get the day number\n",
" return number_to_day[day.isoweekday()] #Based on the day number, get the day name of the week\n",
"\n",
"def get_duration(row):\n",
" #To get difference between start and end, we will convert start and end date to date value first because we changed it\n",
" #to string in the cell above. To do so, we will use pandas function to_datetime and will change it into date format\n",
" #The string format of the date is in format 'mm-dd-yyyy', so to parse it to date format, we will add format value \n",
" #and assign it to '%m-%d-%Y'\n",
" \n",
" start = pd.to_datetime(row['Start'], format = '%m-%d-%Y') #Convert start value to date format\n",
" end = pd.to_datetime(row['End'], format = '%m-%d-%Y') #Convert end value to date format\n",
" \n",
" dur = (end - start).days #Get the difference between end and start time\n",
" \n",
" if dur < 0: #If duration is less negative, change it to positive to remove negative values\n",
" dur = abs(dur)\n",
" \n",
" return dur\n",
"\n",
"df['Day of Week'] = df['Start'].apply(get_day_of_the_week)\n",
"df['Duration'] = df.apply(get_duration, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 136
},
"colab_type": "code",
"id": "3eW6ktV25QQq",
"outputId": "9879f661-892c-40e8-cd18-65d82e2a819f"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Start | \n",
"End | \n",
"Duration | \n",
"Day of Week | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"01-10-2019 | \n",
"04-01-2021 | \n",
"812 | \n",
"Thursday | \n",
"
\n",
"\n",
"1 | \n",
"11-19-2018 | \n",
"05-16-2019 | \n",
"178 | \n",
"Monday | \n",
"
\n",
"\n",
"2 | \n",
"03-06-2019 | \n",
"07-08-2020 | \n",
"490 | \n",
"Wednesday | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Start End Duration Day of Week\n",
"0 01-10-2019 04-01-2021 812 Thursday\n",
"1 11-19-2018 05-16-2019 178 Monday\n",
"2 03-06-2019 07-08-2020 490 Wednesday"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# let's verify\n",
"df[['Start', 'End', 'Duration', 'Day of Week']].head(3)"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "rDpWyScc5QQs"
},
"outputs": [],
"source": [
"# now let's get rid of the cmp_name and user columns,\n",
"# which we don't need any more, and address too\n",
"final_columns = [\n",
" 'Type', 'Start', 'End', 'Duration', 'Day of Week', 'Budget',\n",
" 'Currency', 'Clicks', 'Impressions', 'Spent', 'CTR', 'CPC',\n",
" 'CPI', 'Target Age', 'Target Gender', 'Username', 'Email',\n",
" 'Name', 'Gender', 'Age'\n",
"]\n",
"df = df[final_columns]"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "AF0Xw7Ih5QQu"
},
"source": [
"## 4 - Saving to a file in different formats"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "FbA-2X085QQu"
},
"outputs": [],
"source": [
"# CSV format\n",
"# df.to_csv('df.csv')"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Jm1btUom5QQz"
},
"outputs": [],
"source": [
"# JSON format\n",
"# df.to_json('df.json')"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "SVZ8Ez2Y5QQ6"
},
"outputs": [],
"source": [
"# Spreadsheet format\n",
"# df.to_excel('df.xls')"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "YUAkFPfh5QRB"
},
"source": [
"## 5 - Visualizing results"
]
},
{
"cell_type": "markdown",
"metadata": {
"colab_type": "text",
"id": "9QpobLjT5QRD"
},
"source": [
"First let's take care of the graphics, we need to instruct the notebook to use matplotlib inline and the serif font family."
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "CZMinjG95QRE"
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "Pzwj7xCl5QRG"
},
"outputs": [],
"source": [
"import pylab\n",
"pylab.rcParams.update({'font.family' : 'serif'})"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 284
},
"colab_type": "code",
"id": "mYxt9wqm5QRL",
"outputId": "70b7241b-68eb-4d49-cd3c-61c7eaf97fdb"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Duration | \n",
"Budget | \n",
"Clicks | \n",
"Impressions | \n",
"Spent | \n",
"CTR | \n",
"CPC | \n",
"CPI | \n",
"Age | \n",
"
\n",
"\n",
"\n",
"\n",
"count | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"4920.000000 | \n",
"
\n",
"\n",
"mean | \n",
"383.604268 | \n",
"504930.672358 | \n",
"40222.850203 | \n",
"499999.458943 | \n",
"250462.501016 | \n",
"0.080446 | \n",
"9.862352 | \n",
"0.500926 | \n",
"52.943089 | \n",
"
\n",
"\n",
"std | \n",
"230.213040 | \n",
"286093.538484 | \n",
"21672.391822 | \n",
"2.026843 | \n",
"218451.424805 | \n",
"0.043345 | \n",
"18.669847 | \n",
"0.436903 | \n",
"21.061735 | \n",
"
\n",
"\n",
"min | \n",
"1.000000 | \n",
"1322.000000 | \n",
"701.000000 | \n",
"499992.000000 | \n",
"143.000000 | \n",
"0.001402 | \n",
"0.002627 | \n",
"0.000286 | \n",
"18.000000 | \n",
"
\n",
"\n",
"25% | \n",
"189.000000 | \n",
"255732.500000 | \n",
"22224.500000 | \n",
"499998.000000 | \n",
"70305.500000 | \n",
"0.044449 | \n",
"1.823865 | \n",
"0.140611 | \n",
"34.000000 | \n",
"
\n",
"\n",
"50% | \n",
"372.000000 | \n",
"504126.000000 | \n",
"36854.500000 | \n",
"499999.000000 | \n",
"189206.000000 | \n",
"0.073709 | \n",
"5.180995 | \n",
"0.378414 | \n",
"52.000000 | \n",
"
\n",
"\n",
"75% | \n",
"561.000000 | \n",
"753168.250000 | \n",
"55770.250000 | \n",
"500001.000000 | \n",
"386425.750000 | \n",
"0.111541 | \n",
"11.586214 | \n",
"0.772860 | \n",
"71.000000 | \n",
"
\n",
"\n",
"max | \n",
"1064.000000 | \n",
"999870.000000 | \n",
"98858.000000 | \n",
"500007.000000 | \n",
"981524.000000 | \n",
"0.197716 | \n",
"614.512590 | \n",
"1.963056 | \n",
"90.000000 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Duration Budget Clicks Impressions Spent \\\n",
"count 4920.000000 4920.000000 4920.000000 4920.000000 4920.000000 \n",
"mean 383.604268 504930.672358 40222.850203 499999.458943 250462.501016 \n",
"std 230.213040 286093.538484 21672.391822 2.026843 218451.424805 \n",
"min 1.000000 1322.000000 701.000000 499992.000000 143.000000 \n",
"25% 189.000000 255732.500000 22224.500000 499998.000000 70305.500000 \n",
"50% 372.000000 504126.000000 36854.500000 499999.000000 189206.000000 \n",
"75% 561.000000 753168.250000 55770.250000 500001.000000 386425.750000 \n",
"max 1064.000000 999870.000000 98858.000000 500007.000000 981524.000000 \n",
"\n",
" CTR CPC CPI Age \n",
"count 4920.000000 4920.000000 4920.000000 4920.000000 \n",
"mean 0.080446 9.862352 0.500926 52.943089 \n",
"std 0.043345 18.669847 0.436903 21.061735 \n",
"min 0.001402 0.002627 0.000286 18.000000 \n",
"25% 0.044449 1.823865 0.140611 34.000000 \n",
"50% 0.073709 5.180995 0.378414 52.000000 \n",
"75% 0.111541 11.586214 0.772860 71.000000 \n",
"max 0.197716 614.512590 1.963056 90.000000 "
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 402
},
"colab_type": "code",
"id": "vg3UZLDj5QRM",
"outputId": "334be30d-b83c-48ad-e7af-61ab5cf34449"
},
"outputs": [
{
"data": {
"image/png":...