{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#installing the libraries"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: geoplotlib in c:\\users\\dell\\anaconda3\\lib\\site-packages (0.3.2)\n"
]
}
],
"source": [
"!pip install geoplotlib"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pyglet in c:\\users\\dell\\anaconda3\\lib\\site-packages (1.5.5)\n"
]
}
],
"source": [
"!pip install pyglet"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import geoplotlib"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"#to display the maps in the jupyter notebook\n",
"from IPython.display import Image"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#reading the data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\DELL\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3058: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" interactivity=interactivity, compiler=compiler, result=result)\n"
]
}
],
"source": [
"#data is saved at the same location as the current file location\n",
"\n",
"df = pd.read_csv(\"world_cities_pop.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"#because we have mixed datatypes , so we force \n",
"#python to consider it as a character because of the mixed data types\n",
"#reading data again with modifications\n",
"df = pd.read_csv(\"world_cities_pop.csv\" , dtype = {'Region' : np.str})"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['06', '07', '04', '05', '02', '03', '08', '01', '29', '10', '24',\n",
" '09', '35', '42', '11', '27', '39', '28', '26', '17', '41', '33',\n",
" '30', '13', '40', '18', '23', '19', '37', '14', '32', '36', '31',\n",
" '34', '38', nan, '00', '51', '46', '49', '43', '47', '44', '45',\n",
" '50', '48', '15', '12', '20', '16', '21', '22', '62', '68', '65',\n",
" '64', '66', '58', '60', '61', '71', '57'], dtype=object)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.unique(df['Region'])[:62]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(3173958, 7)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#size of the orignal dataset\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Country object\n",
"City object\n",
"AccentCity object\n",
"Region object\n",
"Population float64\n",
"Latitude float64\n",
"Longitude float64\n",
"dtype: object"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#checking the data types\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Country | \n",
"City | \n",
"AccentCity | \n",
"Region | \n",
"Population | \n",
"Latitude | \n",
"Longitude | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"ad | \n",
"aixas | \n",
"Aixàs | \n",
"06 | \n",
"NaN | \n",
"42.483333 | \n",
"1.466667 | \n",
"
\n",
"\n",
"1 | \n",
"ad | \n",
"aixirivali | \n",
"Aixirivali | \n",
"06 | \n",
"NaN | \n",
"42.466667 | \n",
"1.500000 | \n",
"
\n",
"\n",
"2 | \n",
"ad | \n",
"aixirivall | \n",
"Aixirivall | \n",
"06 | \n",
"NaN | \n",
"42.466667 | \n",
"1.500000 | \n",
"
\n",
"\n",
"3 | \n",
"ad | \n",
"aixirvall | \n",
"Aixirvall | \n",
"06 | \n",
"NaN | \n",
"42.466667 | \n",
"1.500000 | \n",
"
\n",
"\n",
"4 | \n",
"ad | \n",
"aixovall | \n",
"Aixovall | \n",
"06 | \n",
"NaN | \n",
"42.466667 | \n",
"1.483333 | \n",
"
\n",
"\n",
"5 | \n",
"ad | \n",
"andorra | \n",
"Andorra | \n",
"07 | \n",
"NaN | \n",
"42.500000 | \n",
"1.516667 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Country City AccentCity Region Population Latitude Longitude\n",
"0 ad aixas Aixàs 06 NaN 42.483333 1.466667\n",
"1 ad aixirivali Aixirivali 06 NaN 42.466667 1.500000\n",
"2 ad aixirivall Aixirivall 06 NaN 42.466667 1.500000\n",
"3 ad aixirvall Aixirvall 06 NaN 42.466667 1.500000\n",
"4 ad aixovall Aixovall 06 NaN 42.466667 1.483333\n",
"5 ad andorra Andorra 07 NaN 42.500000 1.516667"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#viewing the first 6 rows of the dataframe\n",
"df[0:6]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"#removing one of the two columns - City / AccentCity because\n",
"#they are the same\n",
"df = df.drop(['AccentCity'], axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Country | \n",
"City | \n",
"Region | \n",
"Population | \n",
"Latitude | \n",
"Longitude | \n",
"
\n",
"\n",
"\n",
"\n",
"0 | \n",
"ad | \n",
"aixas | \n",
"06 | \n",
"NaN | \n",
"42.483333 | \n",
"1.466667 | \n",
"
\n",
"\n",
"1 | \n",
"ad | \n",
"aixirivali | \n",
"06 | \n",
"NaN | \n",
"42.466667 | \n",
"1.500000 | \n",
"
\n",
"\n",
"2 | \n",
"ad | \n",
"aixirivall | \n",
"06 | \n",
"NaN | \n",
"42.466667 | \n",
"1.500000 | \n",
"
\n",
"\n",
"3 | \n",
"ad | \n",
"aixirvall | \n",
"06 | \n",
"NaN | \n",
"42.466667 | \n",
"1.500000 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Country City Region Population Latitude Longitude\n",
"0 ad aixas 06 NaN 42.483333 1.466667\n",
"1 ad aixirivali 06 NaN 42.466667 1.500000\n",
"2 ad aixirivall 06 NaN 42.466667 1.500000\n",
"3 ad aixirvall 06 NaN 42.466667 1.500000"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#viewing the dataset again\n",
"df[0:4]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# a)Remove all countries with Population as NaN. Call your DataFrame as df."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"#removing countries with population as NaN\n",
"#dataframe is named as df\n",
"df = df.dropna(subset = ['Population'])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(47980, 6)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Use df.shape to find the number of countries remaining in your DataFrame\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"#we see that number of countries has been reduced to 47980 from 3173958"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Country | \n",
"City | \n",
"Region | \n",
"Population | \n",
"Latitude | \n",
"Longitude | \n",
"
\n",
"\n",
"\n",
"\n",
"6 | \n",
"ad | \n",
"andorra la vella | \n",
"07 | \n",
"20430.0 | \n",
"42.500000 | \n",
"1.516667 | \n",
"
\n",
"\n",
"20 | \n",
"ad | \n",
"canillo | \n",
"02 | \n",
"3292.0 | \n",
"42.566667 | \n",
"1.600000 | \n",
"
\n",
"\n",
"32 | \n",
"ad | \n",
"encamp | \n",
"03 | \n",
"11224.0 | \n",
"42.533333 | \n",
"1.583333 | \n",
"
\n",
"\n",
"49 | \n",
"ad | \n",
"la massana | \n",
"04 | \n",
"7211.0 | \n",
"42.550000 | \n",
"1.516667 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Country City Region Population Latitude Longitude\n",
"6 ad andorra la vella 07 20430.0 42.500000 1.516667\n",
"20 ad canillo 02 3292.0 42.566667 1.600000\n",
"32 ad encamp 03 11224.0 42.533333 1.583333\n",
"49 ad la massana 04 7211.0 42.550000 1.516667"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Viewing the first 4 records after removing Population with NaN\n",
"df[0:4]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# b) Select a country (it can be the name of your country or any other of your choice).For this country, find the number of cities and the population.\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"#country to uppercase\n",
"df['Country'] = df['Country'].str.upper()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Country | \n",
"City | \n",
"Region | \n",
"Population | \n",
"Latitude | \n",
"Longitude | \n",
"
\n",
"\n",
"\n",
"\n",
"2323330 | \n",
"RO | \n",
"edera de jos | \n",
"16 | \n",
"3925.0 | \n",
"45.033333 | \n",
"25.633333 | \n",
"
\n",
"\n",
"2106084 | \n",
"PH | \n",
"bancasi | \n",
"02 | \n",
"3607.0 | \n",
"8.966667 | \n",
"125.466667 | \n",
"
\n",
"\n",
"2115472 | \n",
"PH | \n",
"clarin | \n",
"42 | \n",
"7125.0 | \n",
"8.202300 | \n",
"123.858200 | \n",
"
\n",
"\n",
"157926 | \n",
"AU | \n",
"nerang | \n",
"04 | \n",
"17684.0 | \n",
"-27.989410 | \n",
"153.336334 | \n",
"
\n",
"\n",
"304229 | \n",
"BR | \n",
"lajeado | \n",
"23 | \n",
"65408.0 | \n",
"-29.450000 | \n",
"-51.966667 | \n",
"
\n",
"\n",
"2924582 | \n",
"US | \n",
"steamboat springs | \n",
"CO | \n",
"9349.0 | \n",
"40.485000 | \n",
"-106.831111 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Country City Region Population Latitude Longitude\n",
"2323330 RO edera de jos 16 3925.0 45.033333 25.633333\n",
"2106084 PH bancasi 02 3607.0 8.966667 125.466667\n",
"2115472 PH clarin 42 7125.0 8.202300 123.858200\n",
"157926 AU nerang 04 17684.0 -27.989410 153.336334\n",
"304229 BR lajeado 23 65408.0 -29.450000 -51.966667\n",
"2924582 US steamboat springs CO 9349.0 40.485000 -106.831111"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.sample(6)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"#creating a dataframe df2 to store data for country India\n",
"df2 = df[df.Country == 'IN']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2995, 6)"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.shape"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"259227307.0"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#population of the country selected (India)\n",
"df2['Population'].sum()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"we see the total population of India is 259227307"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2995\n"
]
}
],
"source": [
"#number of cities in country selected (India)\n",
"print(df2['City'].count())"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"City\n",
"abhayapuri 15803.0\n",
"abiramam 6837.0\n",
"abohar 130613.0\n",
"abu road 50266.0\n",
"achalpur 111287.0\n",
" ... \n",
"zahirabad 46509.0\n",
"zaidpur 33400.0\n",
"zamania 32011.0\n",
"ziro 13895.0\n",
"zunheboto 29500.0\n",
"Name: Population, Length: 2899, dtype: float64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#For this country, the population per city is as follows\n",
"df2.groupby('City')['Population'].agg('sum')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# c)Create a dot plot showing the cities in this country. No need to adjust the bounding box. "
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pycountry in c:\\users\\dell\\anaconda3\\lib\\site-packages (19.8.18)\n"
]
}
],
"source": [
"!pip install pycountry"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"import pycountry"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"india = pycountry.countries.get(alpha_2 = 'IN') "
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Country(alpha_2='IN', alpha_3='IND', name='India', numeric='356', official_name='Republic of India')"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"india"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'India'"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"india.name"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"#changing it to labels lat and long because geoplotlib requires to \n",
"#identify coordinates by labels\n",
"\n",
"df2 = df2.rename(columns = {'Latitude' : 'lat' , \n",
" 'Longitude' : 'lon'})"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
"\n",
"\n",
" | \n",
"Country | \n",
"City | \n",
"Region | \n",
"Population | \n",
"lat | \n",
"lon | \n",
"
\n",
"\n",
"\n",
"\n",
"1321980 | \n",
"IN | \n",
"adoni | \n",
"02 | \n",
"163649.0 | \n",
"15.633333 | \n",
"77.283333 | \n",
"
\n",
"\n",
"1357885 | \n",
"IN | \n",
"sundarnagar | \n",
"11 | \n",
"25340.0 | \n",
"31.533333 | \n",
"76.883333 | \n",
"
\n",
"\n",
"1357819 | \n",
"IN | \n",
"sultanpur | \n",
"35 | \n",
"9420.0 | \n",
"23.150000 | \n",
"77.933333 | \n",
"
\n",
"\n",
"
\n",
"
"
],
"text/plain": [
" Country City Region Population lat lon\n",
"1321980 IN adoni 02 163649.0 15.633333 77.283333\n",
"1357885 IN sundarnagar 11 25340.0 31.533333 76.883333\n",
"1357819 IN sultanpur 35 9420.0 23.150000 77.933333"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.sample(3)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"geoplotlib.dot(df2 , color = 'b')\n",
"geoplotlib.show()"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"n = 400"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"image/png":...