Answer To: There is a 2 part. need to take dataset from the keggle and please follow the rubric and...
Rohith answered on May 12 2021
__MACOSX/._57503
57503/.DS_Store
__MACOSX/57503/._.DS_Store
57503/57503_1.html
In [1]:
!wget -O train.csv https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/train.csv
--2020-05-11 16:41:37-- https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 460676 (450K) [text/plain]
Saving to: ‘train.csv’
train.csv 100%[===================>] 449.88K --.-KB/s in 0.06s
2020-05-11 16:41:38 (7.11 MB/s) - ‘train.csv’ saved [460676/460676]
In [2]:
ls
sample_data/ train.csv
EDA¶
In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
import pandas.util.testing as tm
In [0]:
df_train = pd.read_csv('train.csv')
In [5]:
df_train.columns
Out[5]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
'SaleCondition', 'SalePrice'],
dtype='object')
In [6]:
#descriptive statistics summary
df_train['SalePrice'].describe()
Out[6]:
count 1460.000000
mean 180921.195890
std 79442.502883
min 34900.000000
25% 129975.000000
50% 163000.000000
75% 214000.000000
max 755000.000000
Name: SalePrice, dtype: float64
In [7]:
#histogram
sns.distplot(df_train['SalePrice']);
In [8]:
#skewness and kurtosis
print("Skewness: %f" % df_train['SalePrice'].skew())
print("Kurtosis: %f" % df_train['SalePrice'].kurt())
Skewness: 1.882876
Kurtosis: 6.536282
In [9]:
#scatter plot grlivarea/saleprice
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
In [10]:
#scatter plot totalbsmtsf/saleprice
var = 'TotalBsmtSF'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
In [11]:
#box plot overallqual/saleprice
var = 'OverallQual'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [12]:
var = 'YearBuilt'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
plt.xticks(rotation=90);
In [13]:
#correlation matrix
corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
In [14]:
#saleprice correlation matrix
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
In [15]:
#scatterplot
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(df_train[cols], size = 2.5)
plt.show();
In [16]:
#missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
Out[16]:
Total Percent
PoolQC 1453 0.995205
MiscFeature 1406 0.963014
Alley 1369 0.937671
Fence 1179 0.807534
FireplaceQu 690 0.472603
LotFrontage 259 0.177397
GarageCond 81 0.055479
GarageType 81 0.055479
GarageYrBlt 81 0.055479
GarageFinish 81 0.055479
GarageQual 81 0.055479
BsmtExposure 38 0.026027
BsmtFinType2 38 0.026027
BsmtFinType1 37 0.025342
BsmtCond 37 0.025342
BsmtQual 37 0.025342
MasVnrArea 8 0.005479
MasVnrType 8 0.005479
Electrical 1 0.000685
Utilities 0 0.000000
In [17]:
#dealing with missing data
df_train = df_train.drop((missing_data[missing_data['Total'] > 1]).index,1)
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
df_train.isnull().sum().max() #just checking that there's no missing data missing...
Out[17]:
0
In [18]:
#standardizing data
saleprice_scaled = StandardScaler().fit_transform(df_train['SalePrice'][:,np.newaxis]);
low_range = saleprice_scaled[saleprice_scaled[:,0].argsort()][:10]
high_range= saleprice_scaled[saleprice_scaled[:,0].argsort()][-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)
outer range (low) of the distribution:
[[-1.83820775]
[-1.83303414]
[-1.80044422]
[-1.78282123]
[-1.77400974]
[-1.62295562]
[-1.6166617 ]
[-1.58519209]
[-1.58519209]
[-1.57269236]]
outer range (high) of the distribution:
[[3.82758058]
[4.0395221 ]
[4.49473628]
[4.70872962]
[4.728631 ]
[5.06034585]
[5.42191907]
[5.58987866]
[7.10041987]
[7.22629831]]
In [19]:
#bivariate analysis saleprice/grlivarea
var = 'GrLivArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
In [0]:
#deleting points
df_train.sort_values(by = 'GrLivArea', ascending = False)[:2]
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)
In [21]:
#bivariate analysis saleprice/grlivarea
var = 'TotalBsmtSF'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'. Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
In [22]:
#histogram and normal probability plot
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
In [0]:
#applying log transformation
df_train['SalePrice'] = np.log(df_train['SalePrice'])
In [24]:
#transformed histogram and normal probability plot
sns.distplot(df_train['SalePrice'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['SalePrice'], plot=plt)
In [25]:
#histogram and normal probability plot
sns.distplot(df_train['GrLivArea'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
In [26]:
#transformed histogram and normal probability plot
sns.distplot(df_train['GrLivArea'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['GrLivArea'], plot=plt)
In [27]:
#histogram and normal probability plot
sns.distplot(df_train['TotalBsmtSF'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train['TotalBsmtSF'], plot=plt)
In [0]:
#create column for new variable (one is enough because it's a binary categorical feature)
#if area>0 it gets 1, for area==0 it gets 0
df_train['HasBsmt'] = pd.Series(len(df_train['TotalBsmtSF']), index=df_train.index)
df_train['HasBsmt'] = 0
df_train.loc[df_train['TotalBsmtSF']>0,'HasBsmt'] = 1
In [0]:
#transform data
df_train.loc[df_train['HasBsmt']==1,'TotalBsmtSF'] = np.log(df_train['TotalBsmtSF'])
In [30]:
#histogram and normal probability plot
sns.distplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], fit=norm);
fig = plt.figure()
res = stats.probplot(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], plot=plt)
In [31]:
#scatter plot
plt.scatter(df_train['GrLivArea'], df_train['SalePrice']);
In [32]:
#scatter plot
plt.scatter(df_train[df_train['TotalBsmtSF']>0]['TotalBsmtSF'], df_train[df_train['TotalBsmtSF']>0]['SalePrice']);
Regression¶
In [33]:
#import some necessary librairies
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
def ignore_warn(*args, **kwargs):
pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)
from scipy import stats
from scipy.stats import norm, skew #for some statistics
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x)) #Limiting floats output to 3 decimal points
from subprocess import check_output
!wget -O test.csv https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/test.csv
--2020-05-11 16:48:32-- https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/test.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 451405 (441K) [text/plain]
Saving to: ‘test.csv’
test.csv 100%[===================>] 440.83K --.-KB/s in 0.06s
2020-05-11 16:48:32 (6.84 MB/s) - ‘test.csv’ saved [451405/451405]
In [0]:
#Now let's import and put the train and test datasets in pandas dataframe
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [35]:
#check the numbers of samples and features
print("The train data size before dropping Id feature is : {} ".format(train.shape))
print("The test data size before dropping Id feature is : {} ".format(test.shape))
#Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']
#Now drop the 'Id' colum since it's unnecessary for the prediction process.
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)
#check again the data size after dropping the 'Id' variable
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape))
print("The test data size after dropping Id feature is : {} ".format(test.shape))
The train data size before dropping Id feature is : (1460, 81)
The test data size before dropping Id feature is : (1459, 80)
The train data size after dropping Id feature is : (1460, 80)
The test data size after dropping Id feature is : (1459, 79)
Data Preproceesing¶
In [36]:
fig, ax = plt.subplots()
ax.scatter(x = train['GrLivArea'], y = train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
In [37]:
#Deleting outliers
train = train.drop(train[(train['GrLivArea']>4000) & (train['SalePrice']<300000)].index)
#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()
In [38]:
sns.distplot(train['SalePrice'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
mu = 180932.92 and sigma = 79467.79
In [39]:
#We use the numpy fuction log1p which applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])
#Check the new distribution
sns.distplot(train['SalePrice'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
mu = 12.02 and sigma = 0.40
Features Engineering¶
In [40]:
ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train.SalePrice.values
all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(['SalePrice'], axis=1, inplace=True)
print("all_data size is : {}".format(all_data.shape))
all_data size is : (2917, 79)
In [41]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)
Out[41]:
Missing Ratio
PoolQC 99.691
MiscFeature 96.400
Alley 93.212
Fence 80.425
FireplaceQu 48.680
LotFrontage 16.661
GarageFinish 5.451
GarageYrBlt 5.451
GarageQual 5.451
GarageCond 5.451
GarageType 5.382
BsmtExposure 2.811
BsmtCond 2.811
BsmtQual 2.777
BsmtFinType2 2.743
BsmtFinType1 2.708
MasVnrType 0.823
MasVnrArea 0.788
MSZoning 0.137
BsmtFullBath 0.069
In [42]:
f, ax = plt.subplots(figsize=(15, 12))
plt.xticks(rotation='90')
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
Out[42]:
Text(0.5, 1.0, 'Percent missing data by feature')
In [43]:
#Correlation map to see how features are correlated with SalePrice
corrmat = train.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)
Out[43]:
In [0]:
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
In [0]:
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
In [0]:
all_data["Alley"] = all_data["Alley"].fillna("None")
In [0]:
all_data["Fence"] = all_data["Fence"].fillna("None")
In [0]:
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")
In [0]:
#Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
lambda x: x.fillna(x.median()))
In [0]:
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
all_data[col] = all_data[col].fillna('None')
In [0]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
all_data[col] = all_data[col].fillna(0)
In [0]:
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
all_data[col] = all_data[col].fillna(0)
In [0]:
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
all_data[col] = all_data[col].fillna('None')
In [0]:
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)
In [0]:
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
In [0]:
all_data = all_data.drop(['Utilities'], axis=1)
In [0]:
all_data["Functional"] = all_data["Functional"].fillna("Typ")
In [0]:
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
In [0]:
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
In [0]:
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
In [0]:
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])
In [0]:
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")
In [63]:
#Check remaining missing values if any
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head()
Out[63]:
Missing Ratio
In [0]:
#MSSubClass=The building class
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
#Changing OverallCond into a categorical variable
all_data['OverallCond'] = all_data['OverallCond'].astype(str)
#Year and month sold are transformed into categorical features.
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)
In [65]:
from sklearn.preprocessing import LabelEncoder
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
lbl = LabelEncoder()
lbl.fit(list(all_data[c].values))
all_data[c] = lbl.transform(list(all_data[c].values))
# shape
print('Shape all_data: {}'.format(all_data.shape))
Shape all_data: (2917, 78)
In [0]:
# Adding total sqfootage feature
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
In [67]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)
Skew in numerical features:
Out[67]:
Skew
MiscVal 21.940
PoolArea 17.689
LotArea 13.109
LowQualFinSF 12.085
3SsnPorch 11.372
LandSlope 4.973
KitchenAbvGr 4.301
BsmtFinSF2 4.145
EnclosedPorch 4.002
ScreenPorch 3.945
In [68]:
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
#all_data[feat] += 1
all_data[feat] = boxcox1p(all_data[feat], lam)
#all_data[skewed_features] = np.log1p(all_data[skewed_features])
There are 59 skewed numerical features to Box Cox transform
In [69]:
all_data = pd.get_dummies(all_data)
print(all_data.shape)
(2917, 220)
In [0]:
train = all_data[:ntrain]
test = all_data[ntrain:]
Modeling¶
In [0]:
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
In [0]:
#Validation function
n_folds = 5
def rmsle_cv(model):
kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
return(rmse)
In [0]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
In [0]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
In [0]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
In [0]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
max_depth=4, max_features='sqrt',
min_samples_leaf=15, min_samples_split=10,
loss='huber', random_state =5)
In [0]:
# XG Boost
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
learning_rate=0.05, max_depth=3,
min_child_weight=1.7817, n_estimators=2200,
reg_alpha=0.4640, reg_lambda=0.8571,
subsample=0.5213, silent=1,
random_state =7, nthread = -1)
In [0]:
# Light GBM
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
learning_rate=0.05, n_estimators=720,
max_bin = 55, bagging_fraction = 0.8,
bagging_freq = 5, feature_fraction = 0.2319,
feature_fraction_seed=9, bagging_seed=9,
min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
In [79]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Lasso score: 0.1115 (0.0074)
In [80]:
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
ElasticNet score: 0.1116 (0.0074)
In [81]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Kernel Ridge score: 0.1153 (0.0075)
In [82]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Gradient Boosting score: 0.1167 (0.0083)
In [83]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Xgboost score: 0.1150 (0.0066)
In [84]:
score = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))
LGBM score: 0.1176 (0.0062)
Stacking models¶
In [0]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
def __init__(self, models):
self.models = models
# we define clones of the original models to fit the data in
def fit(self, X, y):
self.models_ = [clone(x) for x in self.models]
# Train cloned base models
for model in self.models_:
model.fit(X, y)
return self
#Now we do the predictions for cloned models and average them
def predict(self, X):
predictions = np.column_stack([
model.predict(X) for model in self.models_
])
return np.mean(predictions, axis=1)
In [86]:
averaged_models = AveragingModels(models = (ENet, GBoost, KRR, lasso))
score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
Averaged base models score: 0.1087 (0.0077)
In [0]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
def __init__(self, base_models, meta_model, n_folds=5):
self.base_models = base_models
self.meta_model = meta_model
self.n_folds = n_folds
# We again fit the data on clones of the original models
def fit(self, X, y):
self.base_models_ = [list() for x in self.base_models]
self.meta_model_ = clone(self.meta_model)
kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
# Train cloned base models then create out-of-fold predictions
# that are needed to train the cloned meta-model
out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
for i, model in enumerate(self.base_models):
for train_index, holdout_index in kfold.split(X, y):
instance = clone(model)
self.base_models_[i].append(instance)
instance.fit(X[train_index], y[train_index])
y_pred = instance.predict(X[holdout_index])
out_of_fold_predictions[holdout_index, i] = y_pred
# Now train the cloned meta-model using the out-of-fold predictions as new feature
self.meta_model_.fit(out_of_fold_predictions, y)
return self
#Do the predictions of all base models on the test data and use the averaged predictions as
#meta-features for the final prediction which is done by the meta-model
def predict(self, X):
meta_features = np.column_stack([
np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
for base_models in self.base_models_ ])
return self.meta_model_.predict(meta_features)
In [88]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
meta_model = lasso)
score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))
Stacking Averaged models score: 0.1081 (0.0073)
In [0]:
def rmsle(y, y_pred):
return np.sqrt(mean_squared_error(y, y_pred))
Final Training and Prediction¶
In [90]:
stacked_averaged_models.fit(train.values, y_train)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(y_train, stacked_train_pred))
0.07839506096664622
In [91]:
model_xgb.fit(train, y_train)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(y_train, xgb_train_pred))
0.0788580427618937
In [92]:
model_lgb.fit(train, y_train)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(y_train, lgb_train_pred))
0.07294132053731643
In [93]:
'''RMSE on the entire Train data when averaging'''
print('RMSLE score on train data:')
print(rmsle(y_train,stacked_train_pred*0.70 +
xgb_train_pred*0.15 + lgb_train_pred*0.15 ))
RMSLE score on train data:
0.07566957841091893
In [0]:
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15
In [0]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = ensemble
In [96]:
sub.head()
Out[96]:
Id SalePrice
0 1461 119486.247
1 1462 159579.305
2 1463 187456.970
3 1464 195629.664
4 1465 192512.258
In [0]:
__MACOSX/57503/._57503_1.html
57503/cs-443-and-543-final-project-exam-isyin5im.docx
CS 443 and 543 – Machine Learning – Spring 2020 – Final Project/Exam (200 points)
(DUE MAY
The final project/ final exam will provide you an opportunity to use various machine learning concepts, tools, and techniques to answer real-life case scenarios. It will cover the breadth of the course, including concepts from supervised, unsupervised, and deep learning. Here you will apply the methods you have learned and apply it to various scenarios by developing and evaluating learning models in Python. Your project submission should be in the same format as the previous projects, in Jupyter format, and converted to HTML or PDF. If you have any additional data files, please include those along with work in a ZIP file.
Problem 1 : Predicting house prices (70 points)
When a buyer is seeking a new home, there are many factors that influence price negotiations than number of bedrooms or a white-picket fence. Using the Ames Housing dataset, you will use creative feature engineering and advanced regression techniques in building a model to predict sale price. You may find the dataset and additional information about the data here: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview
1) Your model should be developed using creative feature selection and engineering techniques and advanced regression techniques such that you can find the following:
a ) The predictor variables that should be included in the model (5 points)
b) Are there any interactions that should or should not be considered for including factors into the predictive model? (5 points)
2) You should perform appropriate exploratory data analysis (which should help you find the most important variables and gain a better understanding and visualization of the data) and predictive modeling (including identification of a quantitative evaluation measure, and optimization of the learning model by adding features, creating new features, adjusting parameters, etc). (20 points)
3) You will want to assess both simple (linear regression) as well as advanced regression models (including random forests and ensemble methods (gradient boosting)), find ways to optimize your model to minimize predictive error (by assessing K-cross validation, using stochastic gradient descent and ridge regression). (20 points)
4) What is your final regression model and how was it developed? (5 points)
5) Describe the various models you tried to evaluate their predictive power on the Ames Housing dataset. Which ones were the most effective and why? (5 points)
6) What were the metrics and parameters you had to optimize? (5 points)
7) What were the results of this optimizations? How would you evaluate your final predictive model and how might it be improved? (5 points)
Problem 2: Credit card fraud detection (130 points)
The use of credit card transactions as a popular payment for both online and offline transactions has led to an increasing number of daily fraud transactions. Automated methods are thus desired for credit card fraud detection, since it is challenging to accurately and promptly detect fraudulent transactions due to dramatic data imbalance and large variations of fraud transactions.
Your task here is to see how various supervised and unsupervised learning methods perform in detecting credit card fraud. The dataset you will use is the following Credit Card Fraud Detection dataset: https://www.kaggle.com/mlg-ulb/creditcardfraud
1) First you should assess the dataset, provide a description in words, especially as it pertains to how the variables are obtained, and some of its properties. (5 points)
2) Also, describe the preprocessing that was performed to obtain the data and the rationale for it. (10 points)
2) Next you will want to distinguish between normal and abnormal (fraud) credit card transactions. This will involve training classifiers on this dataset through supervised learning techniques and evaluating their performance. Using Python, you will want to compare several widely used approaches used for credit card fraud detection including Logistic Regression (LR), K-nearest neighbors (KNN), Support Vector Machine (SVM), Decision Tree (DT), Random Forest (RF), and eXtreme Gradient Boosting (XGBoost). (30 points)
3) You will also want to detect transactions that could be considered anomalous. First, using Python, attempt some of the widely used clustering algorithms (Kmeans, Gaussian Mixture Models, DBSCAN, Hierarchical Clustering). Describe their effectiveness by assessing their performance towards detecting anomalies in this dataset. (10 points)
4) There has recently been an interest in developing unsupervised generative models for anomaly detection. Generative models are training to model the distribution of the normal transaction data distribution without any annotations. A transaction that does not follow the distribution would be anomalous. Here you will evaluate several such unsupervised approaches used for credit card fraud detection such as Variational Autoencoder (VAE) and Generational Adversarial Networks (GAN), in Python. (30 points)
5) Describe the evaluation metrics you will use to evaluate this kind of dataset. Which ones will be appropriate and why? You should try several suitable methods and evaluate which one may be the most effective based on an appropriate evaluation metric of your choice. Provide your rationale for each method you try to handle the imbalanced dataset. Also provide a rationale for choosing your evaluation metric. (20 points)
6) Assess the various models, adjusting parameters appropriately and performing any necessary optimizations, and provide a visualization. Describe how you select the key parameters for each of the various models. (10 points)
7) What would be your assessment of the performance of the supervised models compared to that of the unsupervised methods? Explain the advantages and disadvantages of using supervised versus unsupervised methods for credit card fraud detection. (10 points)
8) Describe any limitations to your work. Describe further methods you could try to improve performance of credit card fraud detection. (5 points)
__MACOSX/57503/._cs-443-and-543-final-project-exam-isyin5im.docx
57503/57503_2.html
In [1]:
!wget https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
--2020-05-11 18:09:58-- https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 102634230 (98M) [text/plain]
Saving to: ‘creditcard.csv’
creditcard.csv 100%[===================>] 97.88M 170MB/s in 0.6s
2020-05-11 18:09:59 (170 MB/s) - ‘creditcard.csv’ saved [102634230/102634230]
In [2]:
ls
creditcard.csv sample_data/
Gather Sense of Our Data:¶
The first thing we must do is gather a basic sense of our data. Remember, except for the transaction and amount we dont know what the other columns are (due to privacy reasons). The only thing we know, is that those columns that are unknown have been scaled already.
Summary:¶
The transaction amount is relatively small.
The mean of all the mounts made is approximately USD 88.
There are no "Null" values, so we don't have to work on ways to replace values.
Most of the transactions were Non-Fraud (99.83%) of the time, while Fraud transactions occurs (017%) of the time in the dataframe.
In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time
# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv('creditcard.csv')
df.head()
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
import pandas.util.testing as tm
/usr/local/lib/python3.6/dist-packages/sklearn/externals/six.py:31: FutureWarning: The module is deprecated in version 0.21 and will be removed in version 0.23 since we've dropped support for Python 2.7. Please rely on the official version of six (https://pypi.org/project/six/).
"(https://pypi.org/project/six/).", FutureWarning)
/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.neighbors.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.
warnings.warn(message, FutureWarning)
/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.metrics.classification module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.metrics. Anything that cannot be imported from sklearn.metrics is now part of the private API.
warnings.warn(message, FutureWarning)
Out[3]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0
In [4]:
df.describe()
Out[4]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
count 284807.000000 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 2.848070e+05 284807.000000 284807.000000
mean 94813.859575 1.758743e-12 -8.252298e-13 -9.636929e-13 8.316157e-13 1.591952e-13 4.247354e-13 -3.050180e-13 8.693344e-14 -1.179712e-12 7.094854e-13 1.875015e-12 1.053488e-12 7.137527e-13 -1.491363e-13 -5.225914e-13 -2.280712e-13 -6.428517e-13 4.958987e-13 7.060712e-13 1.766038e-12 -3.406543e-13 -5.713163e-13 -9.725303e-13 1.464139e-12 -6.989087e-13 -5.615260e-13 3.332112e-12 -3.518886e-12 88.349619 0.001727
std 47488.145955 1.958696e+00 1.651309e+00 1.516255e+00 1.415869e+00 1.380247e+00 1.332271e+00 1.237094e+00 1.194353e+00 1.098632e+00 1.088850e+00 1.020713e+00 9.992014e-01 9.952742e-01 9.585956e-01 9.153160e-01 8.762529e-01 8.493371e-01 8.381762e-01 8.140405e-01 7.709250e-01 7.345240e-01 7.257016e-01 6.244603e-01 6.056471e-01 5.212781e-01 4.822270e-01 4.036325e-01 3.300833e-01 250.120109 0.041527
min 0.000000 -5.640751e+01 -7.271573e+01 -4.832559e+01 -5.683171e+00 -1.137433e+02 -2.616051e+01 -4.355724e+01 -7.321672e+01 -1.343407e+01 -2.458826e+01 -4.797473e+00 -1.868371e+01 -5.791881e+00 -1.921433e+01 -4.498945e+00 -1.412985e+01 -2.516280e+01 -9.498746e+00 -7.213527e+00 -5.449772e+01 -3.483038e+01 -1.093314e+01 -4.480774e+01 -2.836627e+00 -1.029540e+01 -2.604551e+00 -2.256568e+01 -1.543008e+01 0.000000 0.000000
25% 54201.500000 -9.203734e-01 -5.985499e-01 -8.903648e-01 -8.486401e-01 -6.915971e-01 -7.682956e-01 -5.540759e-01 -2.086297e-01 -6.430976e-01 -5.354257e-01 -7.624942e-01 -4.055715e-01 -6.485393e-01 -4.255740e-01 -5.828843e-01 -4.680368e-01 -4.837483e-01 -4.988498e-01 -4.562989e-01 -2.117214e-01 -2.283949e-01 -5.423504e-01 -1.618463e-01 -3.545861e-01 -3.171451e-01 -3.269839e-01 -7.083953e-02 -5.295979e-02 5.600000 0.000000
50% 84692.000000 1.810880e-02 6.548556e-02 1.798463e-01 -1.984653e-02 -5.433583e-02 -2.741871e-01 4.010308e-02 2.235804e-02 -5.142873e-02 -9.291738e-02 -3.275735e-02 1.400326e-01 -1.356806e-02 5.060132e-02 4.807155e-02 6.641332e-02 -6.567575e-02 -3.636312e-03 3.734823e-03 -6.248109e-02 -2.945017e-02 6.781943e-03 -1.119293e-02 4.097606e-02 1.659350e-02 -5.213911e-02 1.342146e-03 1.124383e-02 22.000000 0.000000
75% 139320.500000 1.315642e+00 8.037239e-01 1.027196e+00 7.433413e-01 6.119264e-01 3.985649e-01 5.704361e-01 3.273459e-01 5.971390e-01 4.539234e-01 7.395934e-01 6.182380e-01 6.625050e-01 4.931498e-01 6.488208e-01 5.232963e-01 3.996750e-01 5.008067e-01 4.589494e-01 1.330408e-01 1.863772e-01 5.285536e-01 1.476421e-01 4.395266e-01 3.507156e-01 2.409522e-01 9.104512e-02 7.827995e-02 77.165000 0.000000
max 172792.000000 2.454930e+00 2.205773e+01 9.382558e+00 1.687534e+01 3.480167e+01 7.330163e+01 1.205895e+02 2.000721e+01 1.559499e+01 2.374514e+01 1.201891e+01 7.848392e+00 7.126883e+00 1.052677e+01 8.877742e+00 1.731511e+01 9.253526e+00 5.041069e+00 5.591971e+00 3.942090e+01 2.720284e+01 1.050309e+01 2.252841e+01 4.584549e+00 7.519589e+00 3.517346e+00 3.161220e+01 3.384781e+01 25691.160000 1.000000
In [5]:
# Good No Null Values!
df.isnull().sum().max()
Out[5]:
0
In [6]:
df.columns
Out[6]:
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
'Class'],
dtype='object')
In [7]:
# The classes are heavily skewed we need to solve this issue later.
print('No Frauds', round(df['Class'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Frauds', round(df['Class'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset
In [8]:
colors = ["#0101DF", "#DF0101"]
sns.countplot('Class', data=df, palette=colors)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)
Out[8]:
Text(0.5, 1.0, 'Class Distributions \n (0: No Fraud || 1: Fraud)')
In [9]:
fig, ax = plt.subplots(1, 2, figsize=(18,4))
amount_val = df['Amount'].values
time_val = df['Time'].values
sns.distplot(amount_val, ax=ax[0], color='r')
ax[0].set_title('Distribution of Transaction Amount', fontsize=14)
ax[0].set_xlim([min(amount_val), max(amount_val)])
sns.distplot(time_val, ax=ax[1], color='b')
ax[1].set_title('Distribution of Transaction Time', fontsize=14)
ax[1].set_xlim([min(time_val), max(time_val)])
plt.show()
In [0]:
# Since most of our data has already been scaled we should scale the columns that are left to scale (Amount and Time)
from sklearn.preprocessing import StandardScaler, RobustScaler
# RobustScaler is less prone to outliers.
std_scaler = StandardScaler()
rob_scaler = RobustScaler()
df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))
df.drop(['Time','Amount'], axis=1, inplace=True)
In [11]:
scaled_amount = df['scaled_amount']
scaled_time = df['scaled_time']
df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
df.insert(0, 'scaled_amount', scaled_amount)
df.insert(1, 'scaled_time', scaled_time)
# Amount and Time are Scaled!
df.head()
Out[11]:
scaled_amount scaled_time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Class
0 1.783274 -0.994983 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 0
1 -0.269825 -0.994983 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 0
2 4.983721 -0.994972 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 0
3 1.418291 -0.994972 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 0
4 0.670579 -0.994960 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 0
In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
print('No Frauds', round(df['Class'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Frauds', round(df['Class'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
X = df.drop('Class', axis=1)
y = df['Class']
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
for train_index, test_index in sss.split(X, y):
print("Train:", train_index, "Test:", test_index)
original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]
# We already have X_train and y_train for undersample data thats why I am using original to distinguish and to not overwrite these variables.
# original_Xtrain, original_Xtest, original_ytrain, original_ytest = train_test_split(X, y, test_size=0.2, random_state=42)
# Check the Distribution of the labels
# Turn into an array
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values
# See if both the train and test label distribution are similarly distributed
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)
print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))
No Frauds 99.83 % of the dataset
Frauds 0.17 % of the dataset
Train: [ 30473 30496 31002 ... 284804 284805 284806] Test: [ 0 1 2 ... 57017 57018 57019]
Train: [ 0 1 2 ... 284804 284805 284806] Test: [ 30473 30496 31002 ... 113964 113965 113966]
Train: [ 0 1 2 ... 284804 284805 284806] Test: [ 81609 82400 83053 ... 170946 170947 170948]
Train: [ 0 1 2 ... 284804 284805 284806] Test: [150654 150660 150661 ... 227866 227867 227868]
Train: [ 0 1 2 ... 227866 227867 227868] Test: [212516 212644 213092 ... 284804 284805 284806]
----------------------------------------------------------------------------------------------------
Label Distributions:
[0.99827076 0.00172924]
[0.99827952 0.00172048]
In [13]:
# Since our classes are highly skewed we should make them equivalent in order to have a normal distribution of the classes.
# Lets shuffle the data before creating the subsamples
df = df.sample(frac=1)
# amount of fraud classes 492 rows.
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0][:492]
normal_distributed_df = pd.concat([fraud_df, non_fraud_df])
# Shuffle dataframe rows
new_df = normal_distributed_df.sample(frac=1, random_state=42)
new_df.head()
Out[13]:
scaled_amount scaled_time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Class
70301 -0.293440 -0.362821 -1.309182 2.643591 -0.673922 2.856275 -0.569337 -0.807564 -0.158952 1.101358 -1.636385 1.075114 -0.991935 0.152057 0.380580 1.305396 0.638051 -0.100245 0.540601 -0.061017 0.684912 0.107871 0.205377 0.532638 0.229646 0.401670 -0.838382 0.113539 0.326430 0.243853 0
239499 0.391253 0.768877 -2.150855 2.187917 -3.430516 0.119476 -0.173210 0.290700 -2.808988 -2.679351 -0.556685 -4.485483 1.903999 -2.644219 -0.982273 -4.691151 -0.693080 -2.553251 -3.483436 -0.064852 1.490329 0.532145 -0.073205 0.561496 -0.075034 -0.437619 0.353841 -0.521339 0.144465 0.026588 1
242258 0.669322 0.783867 -1.088541 -1.340517 -0.746996 -1.989129 2.299595 -2.267388 0.443727 -0.411735 -1.629431 0.106115 -1.371872 -0.872795 0.421758 0.230802 -0.793717 0.693311 -0.139018 -1.470912 0.111376 0.497916 0.602484 1.133182 0.032524 -0.589040 0.117512 -0.091243 0.040797 0.202447 0
191267 -0.207084 0.522727 0.290155 0.049243 -0.740524 2.865463 1.395294 -0.535163 0.142543 -0.222770 -1.463691 1.713538 -1.127573 -0.708657 0.272186 0.274710 0.235192 -0.463553 0.472995 -0.447899 1.790924 0.247580 0.337349 1.018191 0.303550 0.833886 -1.222306 2.745261 -0.220402 0.168233 1
150662 4.108992 0.107626 -5.839192 7.151532 -12.816760 7.031115 -9.651272 -2.938427 -11.543207 4.843627 -3.494276 -13.320789 8.460244 -17.003289 0.101557 -14.094452 0.747031 -12.661696 -18.912494 -6.626975 4.008921 0.055684 2.462056 1.054865 0.530481 0.472670 -0.275998 0.282435 0.104886 0.254417 1
In [14]:
print('Distribution of the Classes in the subsample dataset')
print(new_df['Class'].value_counts()/len(new_df))
sns.countplot('Class', data=new_df, palette=colors)
plt.title('Equally Distributed Classes', fontsize=14)
plt.show()
Distribution of the Classes in the subsample dataset
1 0.5
0 0.5
Name: Class, dtype: float64
In [15]:
# Make sure we use the subsample in our correlation
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(24,20))
# Entire DataFrame
corr = df.corr()
sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax1)
ax1.set_title("Imbalanced Correlation Matrix \n (don't use for reference)", fontsize=14)
sub_sample_corr = new_df.corr()
sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax2)
ax2.set_title('SubSample Correlation Matrix \n (use for reference)', fontsize=14)
plt.show()
In [16]:
# Anomaly Detection
from scipy.stats import norm
f, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20, 6))
v14_fraud_dist = new_df['V14'].loc[new_df['Class'] == 1].values
sns.distplot(v14_fraud_dist,ax=ax1, fit=norm, color='#FB8861')
ax1.set_title('V14 Distribution \n (Fraud Transactions)', fontsize=14)
v12_fraud_dist = new_df['V12'].loc[new_df['Class'] == 1].values
sns.distplot(v12_fraud_dist,ax=ax2, fit=norm, color='#56F9BB')
ax2.set_title('V12 Distribution \n (Fraud Transactions)', fontsize=14)
v10_fraud_dist = new_df['V10'].loc[new_df['Class'] == 1].values
sns.distplot(v10_fraud_dist,ax=ax3, fit=norm, color='#C5B3F9')
ax3.set_title('V10 Distribution \n (Fraud Transactions)', fontsize=14)
plt.show()
In [17]:
f,(ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20,6))
colors = ['#B3F9C5', '#f9c5b3']
# Boxplots with outliers removed
# Feature V14
sns.boxplot(x="Class", y="V14", data=new_df,ax=ax1, palette=colors)
ax1.set_title("V14 Feature \n Reduction of outliers", fontsize=14)
ax1.annotate('Fewer extreme \n outliers', xy=(0.98, -17.5), xytext=(0, -12),
arrowprops=dict(facecolor='black'),
fontsize=14)
# Feature 12
sns.boxplot(x="Class", y="V12", data=new_df, ax=ax2, palette=colors)
ax2.set_title("V12 Feature \n Reduction of outliers", fontsize=14)
ax2.annotate('Fewer extreme \n outliers', xy=(0.98, -17.3), xytext=(0, -12),
arrowprops=dict(facecolor='black'),
fontsize=14)
# Feature V10
sns.boxplot(x="Class", y="V10", data=new_df, ax=ax3, palette=colors)
ax3.set_title("V10 Feature \n Reduction of outliers", fontsize=14)
ax3.annotate('Fewer extreme \n outliers', xy=(0.95, -16.5), xytext=(0, -12),
arrowprops=dict(facecolor='black'),
fontsize=14)
plt.show()
In [18]:
# New_df is from the random undersample data (fewer instances)
X = new_df.drop('Class', axis=1)
y = new_df['Class']
# T-SNE Implementation
t0 = time.time()
X_reduced_tsne = TSNE(n_components=2, random_state=42).fit_transform(X.values)
t1 = time.time()
print("T-SNE took {:.2} s".format(t1 - t0))
# PCA Implementation
t0 = time.time()
X_reduced_pca = PCA(n_components=2, random_state=42).fit_transform(X.values)
t1 = time.time()
print("PCA took {:.2} s".format(t1 - t0))
# TruncatedSVD
t0 = time.time()
X_reduced_svd = TruncatedSVD(n_components=2, algorithm='randomized', random_state=42).fit_transform(X.values)
t1 = time.time()
print("Truncated SVD took {:.2} s".format(t1 - t0))
T-SNE took 6.2 s
PCA took 0.021 s
Truncated SVD took 0.0045 s
In [0]:
# Classifiers
In [0]:
# Undersampling before cross validating (prone to overfit)
X = new_df.drop('Class', axis=1)
y = new_df['Class']
In [0]:
# Our data is already scaled we should split our training and test sets
from sklearn.model_selection import train_test_split
# This is explicitly used for undersampling.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [0]:
# Turn the values into an array for feeding the classification algorithms.
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values
In [0]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
In [0]:
# Let's implement simple classifiers
classifiers = {
"LogisiticRegression": LogisticRegression(),
"KNearest": KNeighborsClassifier(),
"Support Vector Classifier": SVC(),
"DecisionTreeClassifier": DecisionTreeClassifier(),
"GradientBoostingClassifier":GradientBoostingClassifier(),
"RandomForestClassifier":RandomForestClassifier(),
"DecisionTreeClassifier":DecisionTreeClassifier()
}
In [27]:
# Wow our scores are getting even high scores even when applying cross validation.
from sklearn.model_selection import cross_val_score
for key, classifier in classifiers.items():
classifier.fit(X_train, y_train)
training_score = cross_val_score(classifier, X_train, y_train, cv=5)
print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")
Classifiers: LogisticRegression Has a training score of 93.0 % accuracy score
Classifiers: KNeighborsClassifier Has a training score of 94.0 % accuracy score
Classifiers: SVC Has a training score of 94.0 % accuracy score
Classifiers: DecisionTreeClassifier Has a training score of 92.0 % accuracy score
Classifiers: GradientBoostingClassifier Has a training score of 94.0 % accuracy score
Classifiers: RandomForestClassifier Has a training score of 94.0 % accuracy score
In [0]:
# Use GridSearchCV to find the best parameters.
from sklearn.model_selection import GridSearchCV
# Logistic Regression
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
grid_log_reg.fit(X_train, y_train)
# We automatically get the logistic regression with the best parameters.
log_reg = grid_log_reg.best_estimator_
knears_params = {"n_neighbors": list(range(2,5,1)), 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
grid_knears.fit(X_train, y_train)
# KNears best estimator
knears_neighbors = grid_knears.best_estimator_
# Support Vector Classifier
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)
# SVC best estimator
svc = grid_svc.best_estimator_
# DecisionTree Classifier
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)),
"min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train, y_train)
# tree best estimator
tree_clf = grid_tree.best_estimator_
In [29]:
# Overfitting Case
log_reg_score = cross_val_score(log_reg, X_train, y_train, cv=5)
print('Logistic Regression Cross Validation Score: ', round(log_reg_score.mean() * 100, 2).astype(str) + '%')
knears_score = cross_val_score(knears_neighbors, X_train, y_train, cv=5)
print('Knears Neighbors Cross Validation Score', round(knears_score.mean() * 100, 2).astype(str) + '%')
svc_score = cross_val_score(svc, X_train, y_train, cv=5)
print('Support Vector Classifier Cross Validation Score', round(svc_score.mean() * 100, 2).astype(str) + '%')
tree_score = cross_val_score(tree_clf, X_train, y_train, cv=5)
print('DecisionTree Classifier Cross Validation Score', round(tree_score.mean() * 100, 2).astype(str) + '%')
Logistic Regression Cross Validation Score: 93.91%
Knears Neighbors Cross Validation Score 93.78%
Support Vector Classifier Cross Validation Score 93.52%
DecisionTree Classifier Cross Validation Score 93.4%
In [30]:
# We will undersample during cross validating
undersample_X = df.drop('Class', axis=1)
undersample_y = df['Class']
for train_index, test_index in sss.split(undersample_X, undersample_y):
print("Train:", train_index, "Test:", test_index)
undersample_Xtrain, undersample_Xtest = undersample_X.iloc[train_index], undersample_X.iloc[test_index]
undersample_ytrain, undersample_ytest = undersample_y.iloc[train_index], undersample_y.iloc[test_index]
undersample_Xtrain = undersample_Xtrain.values
undersample_Xtest = undersample_Xtest.values
undersample_ytrain = undersample_ytrain.values
undersample_ytest = undersample_ytest.values
undersample_accuracy = []
undersample_precision = []
undersample_recall = []
undersample_f1 = []
undersample_auc = []
# Implementing NearMiss Technique
# Distribution of NearMiss (Just to see how it distributes the labels we won't use these variables)
X_nearmiss, y_nearmiss = NearMiss().fit_sample(undersample_X.values, undersample_y.values)
print('NearMiss Label Distribution: {}'.format(Counter(y_nearmiss)))
# Cross Validating the right way
for train, test in sss.split(undersample_Xtrain, undersample_ytrain):
undersample_pipeline = imbalanced_make_pipeline(NearMiss(sampling_strategy='majority'), log_reg) # SMOTE happens during Cross Validation not before..
undersample_model = undersample_pipeline.fit(undersample_Xtrain[train], undersample_ytrain[train])
undersample_prediction = undersample_model.predict(undersample_Xtrain[test])
undersample_accuracy.append(undersample_pipeline.score(original_Xtrain[test], original_ytrain[test]))
undersample_precision.append(precision_score(original_ytrain[test], undersample_prediction))
undersample_recall.append(recall_score(original_ytrain[test], undersample_prediction))
undersample_f1.append(f1_score(original_ytrain[test], undersample_prediction))
undersample_auc.append(roc_auc_score(original_ytrain[test], undersample_prediction))
Train: [ 56955 56956 56957 ... 284804 284805 284806] Test: [ 0 1 2 ... 62025 62116 62782]
Train: [ 0 1 2 ... 284804 284805 284806] Test: [ 56955 56956 56957 ... 113925 113926 113927]
Train: [ 0 1 2 ... 284804 284805 284806] Test: [112573 113154 113303 ... 170882 170916 171814]
Train: [ 0 1 2 ... 284804 284805 284806] Test: [170883 170884 170885 ... 227858 227859 227860]
Train: [ 0 1 2 ... 227858 227859 227860] Test: [218173 218495 218735 ... 284804 284805 284806]
NearMiss Label Distribution: Counter({0: 492, 1: 492})
In [32]:
# Let's Plot LogisticRegression Learning Curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator1, estimator2, estimator3, estimator4, X, y, ylim=None, cv=None,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
f, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(20,14), sharey=True)
if ylim is not None:
plt.ylim(*ylim)
# First Estimator
train_sizes, train_scores, test_scores = learning_curve(
estimator1, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
ax1.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="#ff9124")
ax1.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
ax1.plot(train_sizes, train_scores_mean, 'o-', color="#ff9124",
label="Training score")
ax1.plot(train_sizes, test_scores_mean, 'o-', color="#2492ff",
label="Cross-validation score")
ax1.set_title("Logistic Regression Learning Curve", fontsize=14)
ax1.set_xlabel('Training size (m)')
ax1.set_ylabel('Score')
ax1.grid(True)
ax1.legend(loc="best")
# Second Estimator
train_sizes, train_scores, test_scores = learning_curve(
estimator2, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
ax2.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="#ff9124")
ax2.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
ax2.plot(train_sizes, train_scores_mean, 'o-', color="#ff9124",
label="Training score")
ax2.plot(train_sizes, test_scores_mean, 'o-', color="#2492ff",
label="Cross-validation score")
ax2.set_title("Knears Neighbors Learning Curve", fontsize=14)
ax2.set_xlabel('Training size (m)')
ax2.set_ylabel('Score')
ax2.grid(True)
ax2.legend(loc="best")
# Third Estimator
train_sizes, train_scores, test_scores = learning_curve(
estimator3, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
ax3.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="#ff9124")
ax3.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
ax3.plot(train_sizes, train_scores_mean, 'o-', color="#ff9124",
label="Training score")
ax3.plot(train_sizes, test_scores_mean, 'o-', color="#2492ff",
label="Cross-validation score")
ax3.set_title("Support Vector Classifier \n Learning Curve", fontsize=14)
ax3.set_xlabel('Training size (m)')
ax3.set_ylabel('Score')
ax3.grid(True)
ax3.legend(loc="best")
# Fourth Estimator
train_sizes, train_scores, test_scores = learning_curve(
estimator4, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
ax4.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="#ff9124")
ax4.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
ax4.plot(train_sizes, train_scores_mean, 'o-', color="#ff9124",
label="Training score")
ax4.plot(train_sizes, test_scores_mean, 'o-', color="#2492ff",
label="Cross-validation score")
ax4.set_title("Decision Tree Classifier \n Learning Curve", fontsize=14)
ax4.set_xlabel('Training size (m)')
ax4.set_ylabel('Score')
ax4.grid(True)
ax4.legend(loc="best")
return plt
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=42)
plot_learning_curve(log_reg, knears_neighbors, svc, tree_clf, X_train, y_train, (0.87, 1.01), cv=cv, n_jobs=4)
Out[32]:
In [0]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Create a DataFrame with all the scores and the classifiers names.
log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5,
method="decision_function")
knears_pred = cross_val_predict(knears_neighbors, X_train, y_train, cv=5)
svc_pred = cross_val_predict(svc, X_train, y_train, cv=5,
method="decision_function")
tree_pred = cross_val_predict(tree_clf, X_train, y_train, cv=5)
In [34]:
from sklearn.metrics import roc_auc_score
print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))
print('KNears Neighbors: ', roc_auc_score(y_train, knears_pred))
print('Support Vector Classifier: ', roc_auc_score(y_train, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train, tree_pred))
Logistic Regression: 0.9771960442117512
KNears Neighbors: 0.9361612048348524
Support Vector Classifier: 0.9823411544179432
Decision Tree Classifier: 0.9332008273544051
In [0]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Create a DataFrame with all the scores and the classifiers names.
log_reg_pred = cross_val_predict(log_reg, X_train, y_train, cv=5,
method="decision_function")
knears_pred = cross_val_predict(knears_neighbors, X_train, y_train, cv=5)
svc_pred = cross_val_predict(svc, X_train, y_train, cv=5,
method="decision_function")
tree_pred = cross_val_predict(tree_clf, X_train, y_train, cv=5)
In [36]:
from sklearn.metrics import roc_auc_score
print('Logistic Regression: ', roc_auc_score(y_train, log_reg_pred))
print('KNears Neighbors: ', roc_auc_score(y_train, knears_pred))
print('Support Vector Classifier: ', roc_auc_score(y_train, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train, tree_pred))
Logistic Regression: 0.9771960442117512
KNears Neighbors: 0.9361612048348524
Support Vector Classifier: 0.9823411544179432
Decision Tree Classifier: 0.9332008273544051
In [37]:
log_fpr, log_tpr, log_thresold = roc_curve(y_train, log_reg_pred)
knear_fpr, knear_tpr, knear_threshold = roc_curve(y_train, knears_pred)
svc_fpr, svc_tpr, svc_threshold = roc_curve(y_train, svc_pred)
tree_fpr, tree_tpr, tree_threshold = roc_curve(y_train, tree_pred)
def graph_roc_curve_multiple(log_fpr, log_tpr, knear_fpr, knear_tpr, svc_fpr, svc_tpr, tree_fpr, tree_tpr):
plt.figure(figsize=(16,8))
plt.title('ROC Curve \n Top 4 Classifiers', fontsize=18)
plt.plot(log_fpr, log_tpr, label='Logistic Regression Classifier Score: {:.4f}'.format(roc_auc_score(y_train, log_reg_pred)))
plt.plot(knear_fpr, knear_tpr, label='KNears Neighbors Classifier Score: {:.4f}'.format(roc_auc_score(y_train, knears_pred)))
plt.plot(svc_fpr, svc_tpr, label='Support Vector Classifier Score: {:.4f}'.format(roc_auc_score(y_train, svc_pred)))
plt.plot(tree_fpr, tree_tpr, label='Decision Tree Classifier Score: {:.4f}'.format(roc_auc_score(y_train, tree_pred)))
plt.plot([0, 1], [0, 1], 'k--')
plt.axis([-0.01, 1, 0, 1])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.annotate('Minimum ROC Score of 50% \n (This is the minimum score to get)', xy=(0.5, 0.5), xytext=(0.6, 0.3),
arrowprops=dict(facecolor='#6E726D', shrink=0.05),
)
plt.legend()
graph_roc_curve_multiple(log_fpr, log_tpr, knear_fpr, knear_tpr, svc_fpr, svc_tpr, tree_fpr, tree_tpr)
plt.show()
In [38]:
def logistic_roc_curve(log_fpr, log_tpr):
plt.figure(figsize=(12,8))
plt.title('Logistic Regression ROC Curve', fontsize=16)
plt.plot(log_fpr, log_tpr, 'b-', linewidth=2)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.axis([-0.01,1,0,1])
logistic_roc_curve(log_fpr, log_tpr)
plt.show()
In [0]:
from sklearn.metrics import precision_recall_curve
precision, recall, threshold = precision_recall_curve(y_train, log_reg_pred)
In [40]:
y_pred = log_reg.predict(X_train)
# Overfitting Case
print('---' * 45)
print('Overfitting: \n')
print('Recall Score: {:.2f}'.format(recall_score(y_train, y_pred)))
print('Precision Score: {:.2f}'.format(precision_score(y_train, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_train, y_pred)))
print('Accuracy Score: {:.2f}'.format(accuracy_score(y_train, y_pred)))
print('---' * 45)
# How it should look like
print('---' * 45)
print('How it should be:\n')
print("Accuracy Score: {:.2f}".format(np.mean(undersample_accuracy)))
print("Precision Score: {:.2f}".format(np.mean(undersample_precision)))
print("Recall Score: {:.2f}".format(np.mean(undersample_recall)))
print("F1 Score: {:.2f}".format(np.mean(undersample_f1)))
print('---' * 45)
---------------------------------------------------------------------------------------------------------------------------------------
Overfitting:
Recall Score: 0.92
Precision Score: 0.77
F1 Score: 0.84
Accuracy Score: 0.83
---------------------------------------------------------------------------------------------------------------------------------------
---------------------------------------------------------------------------------------------------------------------------------------
How it should be:
Accuracy Score: 0.83
Precision Score: 0.00
Recall Score: 0.18
F1 Score: 0.00
---------------------------------------------------------------------------------------------------------------------------------------
In [41]:
undersample_y_score = log_reg.decision_function(original_Xtest)
from sklearn.metrics import average_precision_score
undersample_average_precision = average_precision_score(original_ytest, undersample_y_score)
print('Average precision-recall score: {0:0.2f}'.format(
undersample_average_precision))
Average precision-recall score: 0.05
In [42]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(12,6))
precision, recall, _ = precision_recall_curve(original_ytest, undersample_y_score)
plt.step(recall, precision, color='#004a93', alpha=0.2,
where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
color='#48a6ff')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('UnderSampling Precision-Recall curve: \n Average Precision-Recall Score ={0:0.2f}'.format(
undersample_average_precision), fontsize=16)
Out[42]:
Text(0.5, 1.0, 'UnderSampling Precision-Recall curve: \n Average Precision-Recall Score =0.05')
In [43]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
print('Length of X (train): {} | Length of y (train): {}'.format(len(original_Xtrain), len(original_ytrain)))
print('Length of X (test): {} | Length of y (test): {}'.format(len(original_Xtest), len(original_ytest)))
# List to append the score and then find the average
accuracy_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []
# Classifier with optimal parameters
# log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm = LogisticRegression()
rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4)
# Implementing SMOTE Technique
# Cross Validating the right way
# Parameters
log_reg_params = {"penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
for train, test in sss.split(original_Xtrain, original_ytrain):
pipeline = imbalanced_make_pipeline(SMOTE(sampling_strategy='minority'), rand_log_reg) # SMOTE happens during Cross Validation not before..
model = pipeline.fit(original_Xtrain[train], original_ytrain[train])
best_est = rand_log_reg.best_estimator_
prediction = best_est.predict(original_Xtrain[test])
accuracy_lst.append(pipeline.score(original_Xtrain[test], original_ytrain[test]))
precision_lst.append(precision_score(original_ytrain[test], prediction))
recall_lst.append(recall_score(original_ytrain[test], prediction))
f1_lst.append(f1_score(original_ytrain[test], prediction))
auc_lst.append(roc_auc_score(original_ytrain[test], prediction))
print('---' * 45)
print('')
print("accuracy: {}".format(np.mean(accuracy_lst)))
print("precision: {}".format(np.mean(precision_lst)))
print("recall: {}".format(np.mean(recall_lst)))
print("f1: {}".format(np.mean(f1_lst)))
print('---' * 45)
Length of X (train): 227846 | Length of y (train): 227846
Length of X (test): 56961 | Length of y (test): 56961
---------------------------------------------------------------------------------------------------------------------------------------
accuracy: 0.9419351252041981
precision: 0.06018427458248645
recall: 0.9111976630963973
f1: 0.11112620278189447
---------------------------------------------------------------------------------------------------------------------------------------
In [44]:
labels = ['No Fraud', 'Fraud']
smote_prediction = best_est.predict(original_Xtest)
print(classification_report(original_ytest, smote_prediction, target_names=labels))
precision recall f1-score support
No Fraud 1.00 0.99 0.99 56863
Fraud 0.11 0.86 0.20 98
accuracy 0.99 56961
macro avg 0.55 0.92 0.59 56961
weighted avg 1.00 0.99 0.99 56961
In [45]:
y_score = best_est.decision_function(original_Xtest)
average_precision = average_precision_score(original_ytest, y_score)
print('Average precision-recall score: {0:0.2f}'.format(
average_precision))
fig = plt.figure(figsize=(12,6))
precision, recall, _ = precision_recall_curve(original_ytest, y_score)
plt.step(recall, precision, color='r', alpha=0.2,
where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
color='#F59B00')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('OverSampling Precision-Recall curve: \n Average Precision-Recall Score ={0:0.2f}'.format(
average_precision), fontsize=16)
Average precision-recall score: 0.74
Out[45]:
Text(0.5, 1.0, 'OverSampling Precision-Recall curve: \n Average Precision-Recall Score =0.74')
In [0]:
# SMOTE Technique (OverSampling) After splitting and Cross Validating
sm = SMOTE(ratio='minority', random_state=42)
# Xsm_train, ysm_train = sm.fit_sample(X_train, y_train)
# This will be the data were we are going to
Xsm_train, ysm_train = sm.fit_sample(original_Xtrain, original_ytrain)
In [47]:
# We Improve the score by 2% points approximately
# Implement GridSearchCV and the other models.
# Logistic Regression
t0 = time.time()
log_reg_sm = grid_log_reg.best_estimator_
log_reg_sm.fit(Xsm_train, ysm_train)
t1 = time.time()
print("Fitting oversample data took :{} sec".format(t1 - t0))
Fitting oversample data took :9.559972286224365 sec
In [48]:
from sklearn.metrics import confusion_matrix
# Logistic Regression fitted using SMOTE technique
y_pred_log_reg = log_reg_sm.predict(X_test)
# Other models fitted with UnderSampling
y_pred_knear = knears_neighbors.predict(X_test)
y_pred_svc = svc.predict(X_test)
y_pred_tree = tree_clf.predict(X_test)
log_reg_cf = confusion_matrix(y_test, y_pred_log_reg)
kneighbors_cf = confusion_matrix(y_test, y_pred_knear)
svc_cf = confusion_matrix(y_test, y_pred_svc)
tree_cf = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots(2, 2,figsize=(22,12))
sns.heatmap(log_reg_cf, ax=ax[0][0], annot=True, cmap=plt.cm.copper)
ax[0, 0].set_title("Logistic Regression \n Confusion Matrix", fontsize=14)
ax[0, 0].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[0, 0].set_yticklabels(['', ''], fontsize=14, rotation=360)
sns.heatmap(kneighbors_cf, ax=ax[0][1], annot=True, cmap=plt.cm.copper)
ax[0][1].set_title("KNearsNeighbors \n Confusion Matrix", fontsize=14)
ax[0][1].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[0][1].set_yticklabels(['', ''], fontsize=14, rotation=360)
sns.heatmap(svc_cf, ax=ax[1][0], annot=True, cmap=plt.cm.copper)
ax[1][0].set_title("Suppor Vector Classifier \n Confusion Matrix", fontsize=14)
ax[1][0].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[1][0].set_yticklabels(['', ''], fontsize=14, rotation=360)
sns.heatmap(tree_cf, ax=ax[1][1], annot=True, cmap=plt.cm.copper)
ax[1][1].set_title("DecisionTree Classifier \n Confusion Matrix", fontsize=14)
ax[1][1].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[1][1].set_yticklabels(['', ''], fontsize=14, rotation=360)
plt.show()
In [49]:
from sklearn.metrics import classification_report
print('Logistic Regression:')
print(classification_report(y_test, y_pred_log_reg))
print('KNears Neighbors:')
print(classification_report(y_test, y_pred_knear))
print('Support Vector Classifier:')
print(classification_report(y_test, y_pred_svc))
print('Support Vector Classifier:')
print(classification_report(y_test, y_pred_tree))
Logistic Regression:
precision recall f1-score support
0 0.90 0.94 0.92 87
1 0.95 0.92 0.94 110
accuracy 0.93 197
macro avg 0.93 0.93 0.93 197
weighted avg 0.93 0.93 0.93 197
KNears Neighbors:
precision recall f1-score support
0 0.87 0.92 0.89 87
1 0.93 0.89 0.91 110
accuracy 0.90 197
macro avg 0.90 0.91 0.90 197
weighted avg 0.91 0.90 0.90 197
Support Vector Classifier:
precision recall f1-score support
0 0.86 0.94 0.90 87
1 0.95 0.88 0.92 110
accuracy 0.91 197
macro avg 0.91 0.91 0.91 197
weighted avg 0.91 0.91 0.91 197
Support Vector Classifier:
precision recall f1-score support
0 0.89 0.93 0.91 87
1 0.94 0.91 0.93 110
accuracy 0.92 197
macro avg 0.92 0.92 0.92 197
weighted avg 0.92 0.92 0.92 197
Neural Nets and other stuff¶
In [50]:
import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Activation
from keras.layers.core import Dense
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy
n_inputs = X_train.shape[1]
undersample_model = Sequential([
Dense(n_inputs, input_shape=(n_inputs, ), activation='relu'),
Dense(32, activation='relu'),
Dense(2, activation='softmax')
])
Using TensorFlow backend.
In [51]:
undersample_model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_1 (Dense) (None, 30) 930
_________________________________________________________________
dense_2 (Dense) (None, 32) 992
_________________________________________________________________
dense_3 (Dense) (None, 2) 66
=================================================================
Total params: 1,988
Trainable params: 1,988
Non-trainable params: 0
_________________________________________________________________
In [0]:
undersample_model.compile(Adam(lr=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
In [53]:
undersample_model.fit(X_train, y_train, validation_split=0.2, batch_size=25, epochs=20, shuffle=True, verbose=2)
Train on 629 samples, validate on 158 samples
Epoch 1/20
- 0s - loss: 0.4561 - accuracy: 0.8283 - val_loss: 0.3323 - val_accuracy: 0.8671
Epoch 2/20
- 0s - loss: 0.2677 - accuracy: 0.9046 - val_loss: 0.2636 - val_accuracy: 0.9177
Epoch 3/20
- 0s - loss: 0.2158 - accuracy: 0.9316 - val_loss: 0.2330 - val_accuracy: 0.9304
Epoch 4/20
- 0s - loss: 0.1805 - accuracy: 0.9412 - val_loss: 0.2107 - val_accuracy: 0.9177
Epoch 5/20
- 0s - loss: 0.1608 - accuracy: 0.9444 - val_loss: 0.1956 - val_accuracy: 0.9304
Epoch 6/20
- 0s - loss: 0.1436 - accuracy: 0.9507 - val_loss: 0.1883 - val_accuracy: 0.9304
Epoch 7/20
- 0s - loss: 0.1334 - accuracy: 0.9523 - val_loss: 0.1792 - val_accuracy: 0.9177
Epoch 8/20
- 0s - loss: 0.1237 - accuracy: 0.9539 - val_loss: 0.1754 - val_accuracy: 0.9114
Epoch 9/20
- 0s - loss: 0.1157 - accuracy: 0.9539 - val_loss: 0.1701 - val_accuracy: 0.9177
Epoch 10/20
- 0s - loss: 0.1078 - accuracy: 0.9555 - val_loss: 0.1686 - val_accuracy: 0.9177
Epoch 11/20
- 0s - loss: 0.1019 - accuracy: 0.9555 - val_loss: 0.1646 - val_accuracy: 0.9177
Epoch 12/20
- 0s - loss: 0.0958 - accuracy: 0.9634 - val_loss: 0.1637 - val_accuracy: 0.9177
Epoch 13/20
- 0s - loss: 0.0917 - accuracy: 0.9650 - val_loss: 0.1601 - val_accuracy: 0.9304
Epoch 14/20
- 0s - loss: 0.0867 - accuracy: 0.9698 - val_loss: 0.1595 - val_accuracy: 0.9241
Epoch 15/20
- 0s - loss: 0.0813 - accuracy: 0.9714 - val_loss: 0.1615 - val_accuracy: 0.9177
Epoch 16/20
- 0s - loss: 0.0786 - accuracy: 0.9634 - val_loss: 0.1583 - val_accuracy: 0.9114
Epoch 17/20
- 0s - loss: 0.0746 - accuracy: 0.9730 - val_loss: 0.1556 - val_accuracy: 0.9177
Epoch 18/20
- 0s - loss: 0.0692 - accuracy: 0.9714 - val_loss: 0.1565 - val_accuracy: 0.9177
Epoch 19/20
- 0s - loss: 0.0654 - accuracy: 0.9746 - val_loss: 0.1584 - val_accuracy: 0.9177
Epoch 20/20
- 0s - loss: 0.0628 - accuracy: 0.9746 - val_loss: 0.1576 - val_accuracy: 0.9304
Out[53]:
In [0]:
undersample_predictions = undersample_model.predict(original_Xtest, batch_size=200, verbose=0)
In [0]:
undersample_fraud_predictions = undersample_model.predict_classes(original_Xtest, batch_size=200, verbose=0)
In [0]:
import itertools
# Create a confusion matrix
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print("Normalized confusion matrix")
else:
print('Confusion matrix, without normalization')
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title, fontsize=14)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
In [57]:
undersample_cm = confusion_matrix(original_ytest, undersample_fraud_predictions)
actual_cm = confusion_matrix(original_ytest, original_ytest)
labels = ['No Fraud', 'Fraud']
fig = plt.figure(figsize=(16,8))
fig.add_subplot(221)
plot_confusion_matrix(undersample_cm, labels, title="Random UnderSample \n Confusion Matrix", cmap=plt.cm.Reds)
fig.add_subplot(222)
plot_confusion_matrix(actual_cm, labels, title="Confusion Matrix \n (with 100% accuracy)", cmap=plt.cm.Greens)
Confusion matrix, without normalization
[[54584 2279]
[ 9 89]]
Confusion matrix, without normalization
[[56863 0]
[ 0 98]]
In [0]:
n_inputs = Xsm_train.shape[1]
oversample_model = Sequential([
Dense(n_inputs, input_shape=(n_inputs, ), activation='relu'),
Dense(32, activation='relu'),
Dense(2, activation='softmax')
])
In [0]:
oversample_model.compile(Adam(lr=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
In [60]:
oversample_model.fit(Xsm_train, ysm_train, validation_split=0.2, batch_size=300, epochs=20, shuffle=True, verbose=2)
Train on 363923 samples, validate on 90981 samples
Epoch 1/20
- 2s - loss: 0.0624 - accuracy: 0.9765 - val_loss: 0.0155 - val_accuracy: 0.9973
Epoch 2/20
- 2s - loss: 0.0126 - accuracy: 0.9974 - val_loss: 0.0075 - val_accuracy: 1.0000
Epoch 3/20
- 2s - loss: 0.0072 - accuracy: 0.9986 - val_loss: 0.0037 - val_accuracy: 1.0000
Epoch 4/20
- 2s - loss: 0.0052 - accuracy: 0.9990 - val_loss: 0.0024 - val_accuracy: 1.0000
Epoch 5/20
- 2s - loss: 0.0038 - accuracy: 0.9993 - val_loss: 0.0018 - val_accuracy: 1.0000
Epoch 6/20
- 2s - loss: 0.0032 - accuracy: 0.9994 - val_loss: 6.7341e-04 - val_accuracy: 1.0000
Epoch 7/20
- 2s - loss: 0.0027 - accuracy: 0.9995 - val_loss: 0.0012 - val_accuracy: 0.9999
Epoch 8/20
- 2s - loss: 0.0026 - accuracy: 0.9995 - val_loss: 5.4168e-04 - val_accuracy: 1.0000
Epoch 9/20
- 2s - loss: 0.0020 - accuracy: 0.9996 - val_loss: 0.0030 - val_accuracy: 0.9999
Epoch 10/20
- 2s - loss: 0.0018 - accuracy: 0.9997 - val_loss: 0.0013 - val_accuracy: 0.9999
Epoch 11/20
- 2s - loss: 0.0018 - accuracy: 0.9996 - val_loss: 4.5583e-04 - val_accuracy: 1.0000
Epoch 12/20
- 2s - loss: 0.0014 - accuracy: 0.9997 - val_loss: 3.1518e-04 - val_accuracy: 1.0000
Epoch 13/20
- 2s - loss: 0.0013 - accuracy: 0.9997 - val_loss: 3.5381e-04 - val_accuracy: 1.0000
Epoch 14/20
- 2s - loss: 0.0012 - accuracy: 0.9997 - val_loss: 8.6079e-04 - val_accuracy: 1.0000
Epoch 15/20
- 2s - loss: 0.0012 - accuracy: 0.9998 - val_loss: 2.2759e-04 - val_accuracy: 1.0000
Epoch 16/20
- 2s - loss: 0.0011 - accuracy: 0.9997 - val_loss: 4.8317e-04 - val_accuracy: 1.0000
Epoch 17/20
- 2s - loss: 0.0010 - accuracy: 0.9998 - val_loss: 3.2697e-04 - val_accuracy: 1.0000
Epoch 18/20
- 2s - loss: 0.0011 - accuracy: 0.9998 - val_loss: 2.1950e-04 - val_accuracy: 1.0000
Epoch 19/20
- 2s - loss: 8.0063e-04 - accuracy: 0.9998 - val_loss: 1.4108e-04 - val_accuracy: 1.0000
Epoch 20/20
- 2s - loss: 9.2644e-04 - accuracy: 0.9998 - val_loss: 0.0013 - val_accuracy: 1.0000
Out[60]:
In [0]:
oversample_predictions = oversample_model.predict(original_Xtest, batch_size=200, verbose=0)
In [0]:
oversample_fraud_predictions = oversample_model.predict_classes(original_Xtest, batch_size=200, verbose=0)
In [63]:
oversample_smote = confusion_matrix(original_ytest, oversample_fraud_predictions)
actual_cm = confusion_matrix(original_ytest, original_ytest)
labels = ['No Fraud', 'Fraud']
fig = plt.figure(figsize=(16,8))
fig.add_subplot(221)
plot_confusion_matrix(oversample_smote, labels, title="OverSample (SMOTE) \n Confusion Matrix", cmap=plt.cm.Oranges)
fig.add_subplot(222)
plot_confusion_matrix(actual_cm, labels, title="Confusion Matrix \n (with 100% accuracy)", cmap=plt.cm.Greens)
Confusion matrix, without normalization
[[56848 15]
[ 30 68]]
Confusion matrix, without normalization
[[56863 0]
[ 0 98]]
Implementing SMOTE on our imbalanced dataset helped us with the imbalance of our labels (more no fraud than fraud transactions). Nevertheless, I still have to state that sometimes the neural network on the oversampled dataset predicts less correct fraud transactions than our model using the undersample dataset. However, remember that the removal of outliers was implemented only on the random undersample dataset and not on the oversampled one. Also, in our undersample data our model is unable to detect for a large number of cases non fraud transactions correctly and instead, misclassifies those non fraud transactions as fraud cases. Imagine that people that were making regular purchases got their card blocked due to the reason that our model classified that transaction as a fraud transaction, this will be a huge disadvantage for the financial institution. The number of customer complaints and customer disatisfaction will increase. The next step of this analysis will be to do an outlier removal on our oversample dataset and see if our accuracy in the test set improves.
Autoencoder¶
In [0]:
import numpy as np
import pandas as pd
import zipfile
from keras.preprocessing.image import ImageDataGenerator, load_img
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from keras.utils import to_categorical
import matplotlib.pyplot as plt
import random
from subprocess import check_output
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import os
import pickle
from scipy import stats
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, load_model,Sequential
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
from tensorflow.keras import regularizers
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc,roc_curve, recall_score, classification_report, f1_score,precision_recall_fscore_support
from sklearn.model_selection import train_test_split
%matplotlib inline
In [65]:
data = pd.read_csv("creditcard.csv")
data.head(10)
Out[65]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0
5 2.0 -0.425966 0.960523 1.141109 -0.168252 0.420987 -0.029728 0.476201 0.260314 -0.568671 -0.371407 1.341262 0.359894 -0.358091 -0.137134 0.517617 0.401726 -0.058133 0.068653 -0.033194 0.084968 -0.208254 -0.559825 -0.026398 -0.371427 -0.232794 0.105915 0.253844 0.081080 3.67 0
6 4.0 1.229658 0.141004 0.045371 1.202613 0.191881 0.272708 -0.005159 0.081213 0.464960 -0.099254 -1.416907 -0.153826 -0.751063 0.167372 0.050144 -0.443587 0.002821 -0.611987 -0.045575 -0.219633 -0.167716 -0.270710 -0.154104 -0.780055 0.750137 -0.257237 0.034507 0.005168 4.99 0
7 7.0 -0.644269 1.417964 1.074380 -0.492199 0.948934 0.428118 1.120631 -3.807864 0.615375 1.249376 -0.619468 0.291474 1.757964 -1.323865 0.686133 -0.076127 -1.222127 -0.358222 0.324505 -0.156742 1.943465 -1.015455 0.057504 -0.649709 -0.415267 -0.051634 -1.206921 -1.085339 40.80 0
8 7.0 -0.894286 0.286157 -0.113192 -0.271526 2.669599 3.721818 0.370145 0.851084 -0.392048 -0.410430 -0.705117 -0.110452 -0.286254 0.074355 -0.328783 -0.210077 -0.499768 0.118765 0.570328 0.052736 -0.073425 -0.268092 -0.204233 1.011592 0.373205 -0.384157 0.011747 0.142404 93.20 0
9 9.0 -0.338262 1.119593 1.044367 -0.222187 0.499361 -0.246761 0.651583 0.069539 -0.736727 -0.366846 1.017614 0.836390 1.006844 -0.443523 0.150219 0.739453 -0.540980 0.476677 0.451773 0.203711 -0.246914 -0.633753 -0.120794 -0.385050 -0.069733 0.094199 0.246219 0.083076 3.68 0
In [66]:
# Check how much examples we have per Class
count_classes = pd.value_counts(data['Class'], sort = True).sort_index()
df = pd.DataFrame({'Class0':count_classes[0], 'Class1*100':count_classes[1]*100}, index=['N']) #
df.plot(kind='bar')
plt.title("Histogram")
plt.xlabel("Class")
plt.ylabel("Count")
print("count for each class : \n\n" + str(count_classes) + "\n")
print ('percent of fraud : ' + str(100*count_classes[1]/(sum(count_classes))) + "%")
count for each class :
0 284315
1 492
Name: Class, dtype: int64
percent of fraud : 0.1727485630620034%
In [67]:
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
#data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))
data = data.drop(['Time'],axis=1) #,'Amount'
data.head()
Out[67]:
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 0.244964 0
1 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 -0.342475 0
2 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 1.160686 0
3 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 0.140534 0
4 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 -0.073403 0
In [0]:
# Test train split
bad_data = data[data['Class'] == 1]
bad_data2=bad_data['Class']
bad_data=bad_data.drop(['Class'], axis=1)
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)
X_train = X_train[X_train['Class']==0]
X_train = X_train.drop(['Class'], axis=1)
y_test = X_test['Class']
X_test = X_test.drop(['Class'], axis=1)
X_train = X_train.values
X_test = X_test.values
X_test2=np.concatenate([bad_data, X_test])
y_test2 = np.concatenate([bad_data2, y_test])
In [0]:
# Model params
input_dim = X_train.shape[1]
encoding_dim = 14
nb_epoch = 15
batch_size = 32
In [70]:
input_layer = Input(shape=(input_dim, ))
encoder = Dense(18, activation="relu")(input_layer)
encoder = Dense(encoding_dim, activation="relu", activity_regularizer=regularizers.l2(10e-3))(encoder) #regularizers.l1(10e-5)
encoder = Dense(int(encoding_dim / 2)+2, activation="relu")(encoder)
decoder = Dense(encoding_dim, activation='relu')(encoder)
decoder = Dense(input_dim, activation='relu')(decoder)
autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
autoencoder.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 29)] 0
_________________________________________________________________
dense (Dense) (None, 18) 540
_________________________________________________________________
dense_1 (Dense) (None, 14) 266
_________________________________________________________________
dense_2 (Dense) (None, 9) 135
_________________________________________________________________
dense_3 (Dense) (None, 14) 140
_________________________________________________________________
dense_4 (Dense) (None, 29) 435
=================================================================
Total params: 1,516
Trainable params: 1,516
Non-trainable params: 0
_________________________________________________________________
In [71]:
checkpointer = ModelCheckpoint(filepath="fraud_model.h5",verbose=0,save_best_only=True)
tensorboard = TensorBoard(log_dir='./logs',histogram_freq=0,write_graph=True,write_images=True)
history = autoencoder.fit(X_train, X_train,epochs=nb_epoch,batch_size=batch_size,shuffle=True,validation_data=(X_test2, X_test2),verbose=1,callbacks=[checkpointer, tensorboard]).history
Epoch 1/15
7108/7108 [==============================] - 16s 2ms/step - loss: 0.8198 - accuracy: 0.5712 - val_loss: 1.0197 - val_accuracy: 0.6713
Epoch 2/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7421 - accuracy: 0.6959 - val_loss: 0.9948 - val_accuracy: 0.7038
Epoch 3/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7225 - accuracy: 0.7296 - val_loss: 0.9727 - val_accuracy: 0.7415
Epoch 4/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7134 - accuracy: 0.7466 - val_loss: 0.9709 - val_accuracy: 0.7461
Epoch 5/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7088 - accuracy: 0.7547 - val_loss: 0.9629 - val_accuracy: 0.7603
Epoch 6/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7066 - accuracy: 0.7601 - val_loss: 0.9603 - val_accuracy: 0.7593
Epoch 7/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7047 - accuracy: 0.7632 - val_loss: 0.9668 - val_accuracy: 0.7578
Epoch 8/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7028 - accuracy: 0.7642 - val_loss: 0.9641 - val_accuracy: 0.7572
Epoch 9/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7026 - accuracy: 0.7631 - val_loss: 0.9557 - val_accuracy: 0.7661
Epoch 10/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.7010 - accuracy: 0.7654 - val_loss: 0.9526 - val_accuracy: 0.7703
Epoch 11/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.6981 - accuracy: 0.7754 - val_loss: 0.9522 - val_accuracy: 0.7840
Epoch 12/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.6957 - accuracy: 0.7816 - val_loss: 0.9471 - val_accuracy: 0.7827
Epoch 13/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.6950 - accuracy: 0.7843 - val_loss: 0.9482 - val_accuracy: 0.7806
Epoch 14/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.6936 - accuracy: 0.7851 - val_loss: 0.9494 - val_accuracy: 0.7936
Epoch 15/15
7108/7108 [==============================] - 14s 2ms/step - loss: 0.6919 - accuracy: 0.7891 - val_loss: 0.9458 - val_accuracy: 0.7838
In [72]:
autoencoder = load_model('fraud_model.h5')
plt.figure(figsize=(12,12))
plt.subplot(321)
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.title('Model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train loss', 'test loss']);#loc='upper right'
plt.subplot(322)
plt.plot(history['accuracy'])
plt.plot(history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.ylim(0.4,1.1)
plt.legend(['train accuracy','test accuracy']);#loc='upper right'
In [0]:
predictions = autoencoder.predict(X_test2)
mse = np.mean(np.power(X_test2 - predictions, 2), axis=1)
error_df = pd.DataFrame({'reconstruction_error': mse,
'true_class': y_test2})
In [74]:
precision, recall, th = precision_recall_curve(error_df.true_class, error_df.reconstruction_error)
auprc = auc(recall, precision)
plt.plot(recall, precision, 'b')
plt.title('Precision Vs Recall over Threshold')
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(['AUPRC = %0.4f'% auprc])
plt.show()
In [75]:
F1_score=[]
F1_score = 2 * (precision * recall) / (precision + recall)
print('Maximal F1 Score : ', str(max(F1_score)))
optimal_th = th[np.argmax(F1_score)]
Maximal F1 Score : 0.5047770700636943
In [76]:
plt.plot(th, precision[1:], 'r', label='Threshold-Precision curve')
plt.plot(th, recall[1:], 'b', label='Threshold-Recall curve')
plt.plot(th, F1_score[1:], 'g', label='Threshold-Recall curve')
plt.title('Precision & Recall & F1 score Vs Threshold')
plt.xlabel('threshold')
plt.ylabel('Precision & Recall & F1')
plt.legend(['Precision','Recall', 'F1 score'])
plt.show()
In [77]:
groups = error_df.groupby('true_class')
fig, ax = plt.subplots()
for name, group in groups:
ax.plot(group.index, group.reconstruction_error, marker='o', ms=3.5, linestyle='',
label= "Fraud" if name == 1 else "Normal")
ax.hlines(optimal_th, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
ax.legend()
plt.title("MSE error Vs Data point index")
plt.ylabel("MSE error")
plt.xlabel("Data point index")
plt.show();
Gan¶
In [0]:
df = pd.read_csv('creditcard.csv')
In [116]:
df.head()
Out[116]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 0.090794 -0.551600 -0.617801 -0.991390 -0.311169 1.468177 -0.470401 0.207971 0.025791 0.403993 0.251412 -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 -0.166974 1.612727 1.065235 0.489095 -0.143772 0.635558 0.463917 -0.114805 -0.183361 -0.145783 -0.069083 -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 0.207643 0.624501 0.066084 0.717293 -0.165946 2.345865 -2.890083 1.109969 -0.121359 -2.261857 0.524980 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 -0.054952 -0.226487 0.178228 0.507757 -0.287924 -0.631418 -1.059647 -0.684093 1.965775 -1.232622 -0.208038 -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 0.753074 -0.822843 0.538196 1.345852 -1.119670 0.175121 -0.451449 -0.237033 -0.038195 0.803487 0.408542 -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 0
In [0]:
df = df.drop('Time',axis=1)
In [0]:
X = df.drop('Class',axis=1).values
y = df['Class'].values
In [119]:
X.shape
Out[119]:
(284807, 29)
In [0]:
X -= X.min(axis=0)
X /= X.max(axis=0)
In [121]:
X.mean()
Out[121]:
0.5213456986239168
In [122]:
X.shape
Out[122]:
(284807, 29)
In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.1)
In [0]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from keras.layers import Input, Embedding, multiply, BatchNormalization
from keras.models import Model, Sequential
from keras.layers.core import Reshape, Dense, Dropout, Flatten
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import Conv2D, UpSampling2D
from keras.datasets import mnist
from keras.optimizers import Adam
from keras import backend as K
from keras import initializers
from keras.utils import to_categorical
# The results are a little better when the dimensionality of the random vector is only 10.
# The dimensionality has been left at 100 for consistency with other GAN implementations.
randomDim = 100
In [0]:
def build_generator(latent_dim,data_dim):
model = Sequential()
model.add(Dense(16, input_dim=latent_dim))
model.add(LeakyReLU(alpha=0.2))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(32, input_dim=latent_dim))
model.add(LeakyReLU(alpha=0.2))
model.add(BatchNormalization(momentum=0.8))
model.add(Dense(data_dim,activation='tanh'))
model.summary()
noise = Input(shape=(latent_dim,))
img = model(noise)
return Model(noise, img)
In [126]:
generator = build_generator(latent_dim=10,data_dim=29)
Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_14 (Dense) (None, 16) 176
_________________________________________________________________
leaky_re_lu_5 (LeakyReLU) (None, 16) 0
_________________________________________________________________
batch_normalization_4 (Batch (None, 16) 64
_________________________________________________________________
dense_15 (Dense) (None, 32) 544
_________________________________________________________________
leaky_re_lu_6 (LeakyReLU) (None, 32) 0
_________________________________________________________________
batch_normalization_5 (Batch (None, 32) 128
_________________________________________________________________
dense_16 (Dense) (None, 29) 957
=================================================================
Total params: 1,869
Trainable params: 1,773
Non-trainable params: 96
_________________________________________________________________
In [0]:
def build_discriminator(data_dim,num_classes):
model = Sequential()
model.add(Dense(31,input_dim=data_dim))
model.add(LeakyReLU(alpha=0.2))
model.add(BatchNormalization(momentum=0.8))
model.add(Dropout(0.25))
model.add(Dense(16,input_dim=data_dim))
model.add(LeakyReLU(alpha=0.2))
model.summary()
img = Input(shape=(data_dim,))
features = model(img)
valid = Dense(1, activation="sigmoid")(features)
label = Dense(num_classes+1, activation="softmax")(features)
return Model(img, [valid, label])
In [128]:
discriminator = build_discriminator(data_dim=29,num_classes=2)
Model: "sequential_6"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_17 (Dense) (None, 31) 930
_________________________________________________________________
leaky_re_lu_7 (LeakyReLU) (None, 31) 0
_________________________________________________________________
batch_normalization_6 (Batch (None, 31) 124
_________________________________________________________________
dropout_2 (Dropout) (None, 31) 0
_________________________________________________________________
dense_18 (Dense) (None, 16) 512
_________________________________________________________________
leaky_re_lu_8 (LeakyReLU) (None, 16) 0
=================================================================
Total params: 1,566
Trainable params: 1,504
Non-trainable params: 62
_________________________________________________________________
In [0]:
optimizer = Adam(0.0002, 0.5)
discriminator.compile(loss=['binary_crossentropy', 'categorical_crossentropy'],
loss_weights=[0.5, 0.5],
optimizer=optimizer,
metrics=['accuracy'])
In [0]:
noise = Input(shape=(10,))
img = generator(noise)
discriminator.trainable = False
valid,_ = discriminator(img)
combined = Model(noise , valid)
combined.compile(loss=['binary_crossentropy'],
optimizer=optimizer)
In [131]:
X_train.shape
Out[131]:
(256326, 29)
In [0]:
from imblearn.under_sampling import RandomUnderSampler
In [0]:
rus = RandomUnderSampler(random_state=42)
In [0]:
X_res, y_res = rus.fit_sample(X, y)
In [135]:
X_res.shape
Out[135]:
(984, 29)
In [136]:
from sklearn.metrics import accuracy_score, f1_score
X_res -= X_res.min()
X_res /= X_res.max()
X_test -= X_test.min()
X_test /= X_test.max()
X_test_res, y_test_res = rus.fit_sample(X_test,y_test)
y_res.shape
Out[136]:
(984,)
In [0]:
def train(X_train,y_train,
X_test,y_test,
generator,discriminator,
combined,
num_classes,
epochs,
batch_size=128):
f1_progress = []
half_batch = int(batch_size / 2)
noise_until = epochs
# Class weights:
# To balance the difference in occurences of digit class labels.
# 50% of labels that the discriminator trains on are 'fake'.
# Weight = 1 / frequency
cw1 = {0: 1, 1: 1}
cw2 = {i: num_classes / half_batch for i in range(num_classes)}
cw2[num_classes] = 1 / half_batch
for epoch in range(epochs):
# ---------------------
# Train Discriminator
# ---------------------
# Select a random half batch of images
idx = np.random.randint(0, X_train.shape[0], half_batch)
imgs = X_train[idx]
# Sample noise and generate a half batch of new images
noise = np.random.normal(0, 1, (half_batch, 10))
gen_imgs = generator.predict(noise)
valid = np.ones((half_batch, 1))
fake = np.zeros((half_batch, 1))
labels = to_categorical(y_train[idx], num_classes=num_classes+1)
fake_labels = to_categorical(np.full((half_batch, 1), num_classes), num_classes=num_classes+1)
# Train the discriminator
d_loss_real = discriminator.train_on_batch(imgs, [valid, labels], class_weight=[cw1, cw2])
d_loss_fake = discriminator.train_on_batch(gen_imgs, [fake, fake_labels], class_weight=[cw1, cw2])
d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
# ---------------------
# Train Generator
# ---------------------
noise = np.random.normal(0, 1, (batch_size, 10))
validity = np.ones((batch_size, 1))
# Train the generator
g_loss = combined.train_on_batch(noise, validity, class_weight=[cw1, cw2])
# Plot the progress
print ("%d [D loss: %f, acc: %.2f%%, op_acc: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[3], 100*d_loss[4], g_loss))
if epoch % 10 == 0:
_,y_pred = discriminator.predict(X_test,batch_size=batch_size)
#print(y_pred.shape)
y_pred = np.argmax(y_pred[:,:-1],axis=1)
f1 = f1_score(y_test,y_pred)
print('Epoch: {}, F1: {:.5f}, F1P: {}'.format(epoch,f1,len(f1_progress)))
f1_progress.append(f1)
return f1_progress
In [0]:
f1_p = train(X_res,y_res,
X_test,y_test,
generator,discriminator,
combined,
num_classes=2,
epochs=10,
batch_size=128)
In [0]:
fig = plt.figure(figsize=(10,7))
plt.plot(f1_p)
plt.xlabel('10 Epochs')
plt.ylabel('F1 Score Validation')
57503/57503_1.ipynb
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "57503-1.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "jJjPXKyBmOI8",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "66bad2ef-ff82-4a67-97d6-0ff2d86fe75c"
},
"source": [
"!wget -O train.csv https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/train.csv"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-05-11 16:41:37-- https://raw.githubusercontent.com/tiwari91/Housing-Prices/master/train.csv\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 460676 (450K) [text/plain]\n",
"Saving to: ‘train.csv’\n",
"\n",
"\rtrain.csv 0%[ ] 0 --.-KB/s \rtrain.csv 100%[===================>] 449.88K --.-KB/s in 0.06s \n",
"\n",
"2020-05-11 16:41:38 (7.11 MB/s) - ‘train.csv’ saved [460676/460676]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "CW5XtIbQnBiZ",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "61495621-c974-498a-bcd0-e53b8ced767a"
},
"source": [
"ls"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[0m\u001b[01;34msample_data\u001b[0m/ train.csv\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "F9hfwkjKnco-",
"colab_type": "text"
},
"source": [
"#EDA"
]
},
{
"cell_type": "code",
"metadata": {
"id": "3AYb5FCynWCp",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 71
},
"outputId": "23eeb5d8-2d68-4563-be88-4f639f0b2df4"
},
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np\n",
"from scipy.stats import norm\n",
"from sklearn.preprocessing import StandardScaler\n",
"from scipy import stats\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"%matplotlib inline"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
" import pandas.util.testing as tm\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "kNLij4WDnfJt",
"colab_type": "code",
"colab": {}
},
"source": [
"df_train = pd.read_csv('train.csv')"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "_mXbLSwNnmV8",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 323
},
"outputId": "93a5a935-1531-4f20-c0a0-de57a970847e"
},
"source": [
"df_train.columns"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',\n",
" 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',\n",
" 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',\n",
" 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',\n",
" 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',\n",
" 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',\n",
" 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',\n",
" 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',\n",
" 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',\n",
" 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',\n",
" 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',\n",
" 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',\n",
" 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',\n",
" 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',\n",
" 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',\n",
" 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',\n",
" 'SaleCondition', 'SalePrice'],\n",
" dtype='object')"
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "m0jbURLqnnsm",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 170
},
"outputId": "be116276-ff8d-405e-ef48-026b37a20f78"
},
"source": [
"#descriptive statistics summary\n",
"df_train['SalePrice'].describe()"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"count 1460.000000\n",
"mean 180921.195890\n",
"std 79442.502883\n",
"min 34900.000000\n",
"25% 129975.000000\n",
"50% 163000.000000\n",
"75% 214000.000000\n",
"max 755000.000000\n",
"Name: SalePrice, dtype: float64"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "oH1MnZQ9nqd7",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 290
},
"outputId": "67a5b064-ac8c-4aa4-e36e-111060ab31e0"
},
"source": [
"#histogram\n",
"sns.distplot(df_train['SalePrice']);"
],
"execution_count": 7,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAW0AAAERCAYAAACw4faYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deXxcZ33v8c9vZqTRvlmybNmWHTu2E8eJndhJ7EASEyCELXB74TZsBQp1C7QFulC4cEvh3vbeXtre0gKFsBdISBPCFpYkBUJYndiO49iJ90WLrcXWrtE6eu4f58iWFS0je2bOjOb7fr3m5ZkzZ875eWb01aPnPOc55pxDRESyQyjoAkREJHEKbRGRLKLQFhHJIgptEZEsotAWEckiCm0RkSySstA2sy+ZWZuZ7UvS9urN7BEze87MnjWzFcnYrohINkllS/srwB1J3N6/A59wzl0J3AC0JXHbIiJZIWWh7Zx7HOiYuMzMVpnZj81sl5n9wsyuSGRbZrYOiDjnHvW33eeciyW/ahGRzJbuPu27gT9xzm0C/gL4TIKvWwN0mdmDZvaUmX3CzMIpq1JEJENF0rUjMysBbgLuN7PxxVH/ud8BPj7Fy5qdcy/Dq/Nm4FqgAbgPeBvwxdRWLSKSWdIW2nit+i7n3MbJTzjnHgQenOG1TcAe59wxADP7DrAFhbaI5Ji0dY8453qA42b2egDzbEjw5U8CFWZW4z++DXg2BWWKiGS0VA75uxf4DbDWzJrM7B3Am4B3mNnTwH7gNYlsyzkXx+sD/4mZPQMY8PnUVC4ikrlMU7OKiGQPnREpIpJFUnIgsrq62q1YsSIVmxYRmZd27dp1xjlXM9t6KQntFStWsHPnzlRsWkRkXjKzk4msp+4REZEsotAWEckiCm0RkSyi0BYRySIKbRGRLKLQFhHJIgptEZEsotAWEckiCm0RkSySzvm0c8o9OxqmXP7GG+vTXImIzCdqaYuIZBGFtohIFkkotM3s/Wa238z2mdm9ZlaQ6sJEROT5Zg1tM1sC/Cmw2Tm3HggDd6W6MBEReb5Eu0ciQKGZRYAi4FTqShIRkenMGtrOuWbgH4AG4DTQ7Zx7ZPJ6ZrbdzHaa2c729vbkVyoiIrMP+TOzSrwL8F4GdAH3m9mbnXNfn7iec+5u4G6AzZs368KT05hqKKCGAYpIohLpHnkJcNw51+6cGwEeBG5KbVkiIjKVREK7AdhiZkVmZsCLgedSW5aIiEwlkT7tHcADwG7gGf81d6e4LhERmUJCp7E75z4KfDTFtYiIyCx0RqSISBZRaIuIZBGFtohIFlFoi4hkEYW2iEgWUWiLiGQRhbaISBZRaIuIZBFdIzJNDrf10tw5QCRkVBbnc1VdedAliUgWUminwWh8jHufaGBwZOzcsndvW8XSyqIAqxKRbKTukTQ43NbH4MgYb75xOR+84wryIyF+c/Rs0GWJSBZSaKfB001dFOaFWbuolLLCPK6rr2Rvczd9Q6NBlyYiWUahnWLDo2McON3L+iXlhEMGwNaVC4iPOZ443hFwdSKSbRTaKXawtZfh+BjXLD1/4LGmNMqa2hJ2HD9LfEwX+RGRxCm0U2xvUxel0QiXVRdfsHzrymp6B0fZf6o7oMpEJBsptFNocCTOwRavayRkdsFzq2tLKI1G2H+qJ6DqRCQbzRraZrbWzPZMuPWY2fvSUVy2O3m2n9Exx7q6suc9FzJjdW0pR9r6GI2PTfFqEZHnS+RyYwedcxudcxuBTUAM+HbKK5sHWnqGAKgrL5zy+TW1JQyMxNnT2JXOskQki821e+TFwFHn3MlUFDPftPYMUl6YR2F+eMrnVy8sxYDHDrantzARyVpzDe27gHunesLMtpvZTjPb2d6uEAJo6R6ktiw67fOF+WHqFxTx2KG2NFYlItks4dA2s3zgTuD+qZ53zt3tnNvsnNtcU1OTrPqyVnzM0d47xKKyghnXW1tbyr7mHtp6B9NUmYhks7m0tF8O7HbOtaaqmPnkTN8QceeonSW019SWAvD4oTPpKEtEstxcQvsNTNM1Is/X0uO1nBeVzxzai8sLqCmN8thBdZGIyOwSCm0zKwZeCjyY2nLmj9buQUIGNSXT92kDmBm3rqnhF4fPaOifiMwqodB2zvU75xY453T6XoJaegapLokSCc/+Fm9bW0P3wIiG/onIrDSfdoq09gwmPF/2zZfXEA4Zjx1sZ/OKqnPL79nR8Lx133hjfdJqFJHso9PYU6BvaJTO2Mis/dnjyovyuK6+QkP/RGRWCu0UONjSCzDrcL+Jtq1dqKF/IjIrhXYKjIf2bMP9Jrp1jTe2XUP/RGQmCu0UONjSQ34kREVRXsKvuaquTEP/RGRWCu0UOH42RnVJ/vOmY53J+NC/xw+1a+ifiExLoZ0CTR0xqopnHp89lRetXUjP4ChPaeifiExDoZ1k8TFHU+cAVXPoGhl385pq8sLGI/tbUlCZiMwHCu0ka+0ZZDg+RmVx/pxfW1aQx02rqnl4fyvO6dqRIvJ8Cu0ka+yIAVBVNPfQBnjZVYto6IhxsLU3mWWJyDyh0E6yhvHQvoiWNsBL19ViBg/v02SKIvJ8Cu0ka+yIETLvLMeLUVMaZVN9JQ+rX1tEpqDQTrLGzgEWlxcSCV38W3v7VbU8e7qHjv7hJFYmIvOBQjvJGjpiLKua+kK+iXrZVYsAeO50TzJKEpF5RKGdZA0dMeqrEpvdbzrLFxRzxaJS9p9SaIvIhRTaSTQwHKe9d4hlCU7JOpPbr1rEybP99A2NJqEyEZkvEr1yTYWZPWBmB8zsOTPbmurCslFTpzdypH7BpYf2y66qxQEH1EUiIhMk2tL+JPBj59wVwAbgudSVlL0a/dBedondIwDrFpdRUZSnLhIRucCsoW1m5cAtwBcBnHPDzjlNjjGFhrN+aCehe8TMuGpxGUfb+xgaiV/y9kRkfkjkcmOXAe3Al81sA7ALeK9zrn/iSma2HdgOUF+fm5fEaugYoDAvTHXJ3E6smeqyYgDr6sr51dGzHGrr4+ol5ckoUUSyXCLdIxHgOuDfnHPXAv3AByev5Jy72zm32Tm3uaamJsllZofGTm/kiM1hStaZLF9QRHF+mP2ndD1lEfEkEtpNQJNzbof/+AG8EJdJGpMwRnuikBlXLi7jYEsvo2OaY1tEEght51wL0Ghma/1FLwaeTWlVWcg5559Yc+n92RNdsaiUodExGjsGkrpdEclOifRpA/wJ8A0zyweOAW9PXUnZqTM2Qmw4ztIkHIScaGVNCSGDI229XFZdnNRti0j2SSi0nXN7gM0priVr3bOjgeZOryV8rL1v2gOLF6MgL8zSyiIOt/Xx0nVJ26yIZCmdEZkkXQPe5E4VFzmP9kxWLyyhuXOA2LDOjhTJdQrtJOmKjQBQUXhxU7LO5PKFJTjgaHv/rOuKyPym0E6SrtgweWGjKD+c9G0vrSwiGglxpE1XsxHJdQrtJOkaGKGiKD9pY7QnCoeMVTUlHG7r07UjRXKcQjtJumIjKekaGXf5whK6YiOc8E+VF5HcpNBOkq7YcEoOQo5bvbAEgF8ebk/ZPkQk8ym0k2B4dIz+4TiVF3ldyERUFedTWhBh18nOlO1DRDKfQjsJxof7laewe8TMWFZZxFONmmBRJJcptJOge3y4Xwq7RwDqq4o4eTbGmb6hlO5HRDKXQjsJxsdop7J7BDh37ck9DWpti+QqhXYSdA4MEzIoLUhtaC+pLCQSMnY3qF9bJFcptJOgOzZCWUEe4VDyx2hPlBcOsa6uTKEtksMU2knQGRuhIsVdI+OuXVbB3qZuRuOaX1skFym0k6BrILVjtCe6bnklseE4B1t1SrtILlJoX6L4mKNnILVnQ050XX0lALt1MFIkJym0L1FrzyBjDsrT1D2ytLKQ6pJ8nlK/tkhOSugiCGZ2AugF4sCoc04XRPA1d3kXP6hMU/eImXFtfaWG/YnkqLm0tF/knNuowL7QKT+009U9AnD1knKOnemnd3AkbfsUkcyg7pFL1ORfZixdByIB1i8pA+C50zoYKZJrEg1tBzxiZrvMbPtUK5jZdjPbaWY729tzZya6U10DFOWHyY+k7/ff+rpyAPY1d6dtnyKSGRJNmhc6564DXg68x8xumbyCc+5u59xm59zmmpqapBaZyZq7BtI2RnvcwrICakqj7Dul0BbJNQmFtnOu2f+3Dfg2cEMqi8omzZ0DVBSmr2tk3Pq6MvY396R9vyISrFlD28yKzax0/D5wO7Av1YVlA+ccpwJoaQOsX1LO4bZeBobjad+3iAQnkZZ2LfBLM3saeAL4gXPux6ktKzt0D4zQPxxP60HIcVfVlTPm4ECLWtsiuWTWcdrOuWPAhjTUknXOjRxJ43C/e3Y0ANAZ8y688OVfneC507288cb6tNUgIsHRkL9LcG6MdgDdIxWFeRTmhc/VICK5QaF9CZq70j9Ge5yZsaSikFPdCm2RXKLQvgTNnQMU5IUozg8Hsv+6igJau4cYHdM0rSK5QqF9CU51D1BXUYhZai9+MJ26ikLiztHWo2tGiuQKhfYlaO4cYElFYWD7r/P3rX5tkdyh0L4EzV2DgYZ2VXE+0UjoXN+6iMx/Cu2LNDgS50zfUKChHTJjcXmhWtoiOUShfZHGg7IuwNAGWFJRQEvPoK4ZKZIjFNoX6VTXIABLKoMN7bqKQkbijmNn+gOtQ0TSQ6F9kZq7YgCBdo/A+Za+pmkVyQ0K7YvU3DlAyGBReUGgdVSXRMkLG/s0459ITlBoX6TmrkFqywrICwf7FoZDxqKyAs2tLZIjFNoXqbkrFnjXyLi6ikKePdXD2JgLuhQRSTGF9kVq6hwI/CDkuCUVhfQNjXKyIxZ0KSKSYgrtizASH+NU1wD1VUVBlwLoYKRILlFoX4RTXQOMOViWIaG9sGz8YKRCW2S+Szi0zSxsZk+Z2UOpLCgbNPjdEJnS0o6EQly5uIw9jV1BlyIiKTaXlvZ7gedSVUg2ybTQBriuvpK9Td06M1JknksotM1sKfBK4AupLSc7NHYMkB8OUVsW7Bjtia5bXsnASJwDLb1BlyIiKZRoS/ufgQ8A0zbjzGy7me00s53t7e1JKS5TNXbEWFpZSDgUzDzaU9m0vBKAXSc7A65ERFJp1tA2s1cBbc65XTOt55y72zm32Tm3uaamJmkFZqKGjljGHIQcV1deQG1ZlN0NCm2R+SyRlvYLgDvN7ATwTeA2M/t6SqvKcA0dsYzqzwbvmpGblleqpS0yz80a2s65DznnljrnVgB3AT91zr055ZVlqO7YCN0DIxkX2uAdjGzqHKCtdzDoUkQkRTROe44aO72RI5nWPQJwbb3Xr737pIb+icxXcwpt59xjzrlXpaqYbJCJw/3GrV9SRn44pH5tkXlMLe05Gg/tZVWZMe/IRNFImPVLytitfm2ReUuhPUcNHTGqivMpLcgLupQpbVpeyd7mbgZH4kGXIiIpoNCeo8YMHO430U2rqhkeHeOJ4x1BlyIiKaDQnqNMHO430ZaVC8iPhHjs4Pw+wUkkVym052A0PkZz5wD1GdifPa4wP8yWlQt47FBb0KWISAootOfgdPcgo2Muo1vaANvW1HCsvZ9GXRRBZN5RaM/BybOZO0Z7om1rvWkEHjuo1rbIfKPQnoPjZ/oAWFVTEnAlM7usupj6qiL1a4vMQwrtOTja3k9xfpiFpdGgS5mRmbFtbQ2/PnpWQ/9E5hmF9hwcP9PPZTXFmGXOlKzT2ba2hoGRuIb+icwzCu05OHamj8uqM7trZNzWldUU54d5aO+poEsRkSRSaCdoaDROU+cAK6uLgy4lIYX5YV5x9WJ+sPc0seHRoMsRkSRRaCfo5NkYzsHKmuwIbYDXb15G/3CcHz7TEnQpIpIkkaALyBbH2r2RIysztHvknh0Nz1v2hhuWsWJBEffvbOR1m5YGUJWIJJta2gk6dqYfgBXVmT1GeyIz43WblrLjeAcNZ3Wijch8oJZ2go6197OwNJqxs/tN53euW8o/PnqIB3Y18me3rwWmbpW/8cb6dJcmIhdh1tA2swLgcSDqr/+Ac+6jqS4saJOD7YnjHVnVnz2urqKQm1fXcN/ORt5z2+VEI+GgSxKRS5BI98gQcJtzbgOwEbjDzLaktqzMc6ZvKGuG+032zhdeRmvPEN99SsP/RLLdrC1t55wD+vyHef7NpbKoTBMbGiU2HKc7Njxl10Kmu3l1NVfVlfHZx4/yX3VAUiSrJdSnbWZhYBdwOfBp59yOKdbZDmwHqK+fX/2jZ/qGAKguyezT1yeb+Avm6iXlfPPJRv7Hd/axfkl5gFWJyKVIaPSIcy7unNsILAVuMLP1U6xzt3Nus3Nuc01NTbLrDNSZvmEAqjN8zpGZrF9STlVxPo8fbsf740lEstFcr8beBfwMuCM15WSm9r4hQgaVRflBl3LRQmbcsrqGps4Bjrb3B12OiFykWUPbzGrMrMK/Xwi8FDiQ6sIySXvvEAuKo4RDmT9R1Eyura+grCDCTw9onm2RbJVIS3sx8DMz2ws8CTzqnHsotWVlltaeQRaWZW/XyLi8cIibV9dw4mw/x8+otS2SjWYNbefcXufctc65a5xz651zH09HYZliJD5GR/8wtWUFQZeSFNevqKI4GuFnuqqNSFbSaeyzaO8dwsG8Ce38SIibL6/mSFufriEpkoUU2rNo7RkEoDaLR45MduNlVRTmhdXaFslCCu1ZtPYMETZjQZaN0Z5JNC/MCy6v5kBLL6e6BoIuR0TmQKE9i7beQapL87N+5MhkN61aQEFeSK1tkSyj0J5Fa8/gvOnPnqggL8zWlQvYf6rnXBeQiGQ+hfYMhkbjdMZGWFg6/0Ib4AWrqsmPqLUtkk0U2jNo6/HmHFk0D8ZoT6UoGmHLZVU809TNCY3bFskKCu0ZtPV63QYL52H3yLgXXF5NOGR87vGjQZciIglQaM+gtWeISMioKs7eOUdmU1qQx6bllTywq4mWbvVti2Q6hfYMWnsGWVgaJWTza+TIZDevrmHMwRd+cSzoUkRkFgrtGczXkSOTVRXn8+prFnPPEw109g8HXY6IzEChPY3Y8Cg9g6Pzuj97ondtu5zYcJyv/uZE0KWIyAwU2tM47ffv1pXnRmivXVTKS66s5cu/OkH/0GjQ5YjINBTa0xgP7UU5EtoA737RKroHRrj3iey7DqZIrlBoT+N01wClBRFKC/KCLiVtrquvZMvKKj7/i2MMjcaDLkdEpqDQnsbp7kEW51Are9y7t11Oa88QD+5uDroUEZlCIpcbW2ZmPzOzZ81sv5m9Nx2FBWlwJE5b7yB15YVBl5J2N6+u5uol5XzmsSMMj44FXY6ITJJIS3sU+HPn3DpgC/AeM1uX2rKCdaStjzEHiytyL7TNjD+/fQ2NHQN8Y8fJoMsRkUkSudzYaefcbv9+L/AcsCTVhQXp2VM9ADnZPQJw65oablq1gH/96RF6B0eCLkdEJphTn7aZrQCuBXakophMsf9UN/mR0Lw+fX0mZsaHXn4lHf3DfO7nOktSJJMkHNpmVgJ8C3ifc65niue3m9lOM9vZ3t6ezBrT7tnTPSwuK5j3p6/P5Oql5bx6Qx1f+OUxzUkikkESCm0zy8ML7G845x6cah3n3N3Ouc3Ouc01NTXJrDGtxsYcz53uZXFFbnaNTPSBl63FOfifP3g26FJExJfI6BEDvgg855z7p9SXFKzGzhh9Q6MszsGRI5Mtqyrij190OT/Ye5rHD2X3X08i80UiLe0XAG8BbjOzPf7tFSmuKzD7c/wg5GTlhXksKM7n/fft4au/PsE9O3S2pEiQEhk98kvnnDnnrnHObfRvP0xHcUHY29RNXthyYna/RETCIe7cWMfZ/mEeP6zWtkjQIkEXkGmebuziysVl5IVz62TRmVrQqxeWcvWScn5+sJ2NSyvSWJWITJZbyTSL+Jhjb1MXG5cpmCZ75dWLCYeM7+89hXMu6HJEcpZCe4Kj7X30D8fZoNbk85QV5vGSK2s51NrHw/tbgi5HJGcptCfY09AFwAa1tKe0ZeUCFpcX8LHvP6s5t0UCotCeYE9TF6UFEVZWFwddSkYKh4zXbKjjdPcgn/zJ4aDLEclJCu0Jnm7sYsPSCkKh3D0Tcjb1C4q56/plfPGXxznY0ht0OSI5R6NHfAPDcQ609PKuW1cFXUrGu7ymhGgkxPav7WT7zSsx/3T/N95YH3BlIvOfWtq+/ae6iY859WcnoCga4Y6rFnHybIyn/OMAIpIeCm3fnsbxg5DlAVeSHa5bXkl9VRE/2nea2LAOSoqki0Lbt6exiyUVhSws1ZmQiQiZ8ZqNdQyMxHlkf2vQ5YjkDIU24Jxj18lONtara2QuFpcXsnXlAp480UFjRyzockRygkIbOHk2xunuQbasXBB0KVnnxVfWUloQ4bt7momP6UxJkVRTaAO/OXYWgK0K7TkryAvziqsXc6p7kK//VteUFEk1DfkDfnP0LDWlUVbV6KSai3H1knJ2nuzkHx4+yMuvXpTQcYGpJqjSkEGR2eV8S9s5x2+OnWXrygXnxhvL3JgZd26oY2h0jL/7wXNBlyMyr+V8aB9t76e9d4itq9Q1cimqS6L80bZVfGfPKX74zOmgyxGZt3I+tNWfnTzvedEqrq2v4C/uf5oDLc+79rOIJEEi14j8kpm1mdm+dBSUbr89epZFZQUsX1AUdClZLxoJ89k3b6IkGmH7v++iKzYcdEki804iLe2vAHekuI5AOOf47bGzbF2l/uxkqS0r4HNv2URL9yCv/+xvONyqSaVEkimRa0Q+DnSkoZa0O9DSy9n+YXWNJNm19ZV85e3X0xkb5s5P/Yqv//YkgyPxoMsSmReS1qdtZtvNbKeZ7Wxvz44LwD68vwUz2HZFTdClzDs3XV7ND//0ZjYsK+cj39nHDX/7n/z3bz/Dzw+1K8BFLkHSxmk75+4G7gbYvHlzVpwa9+N9LVy/vErzjaTIwrIC7nnnFn599Czf2t3Eg7ubuGdHA9FIiNW1pdy6uoYllYVBlymSVXL25Jqj7X0caOnlo69eF3Qp81ooZLxwdTUvXF3N3/2Xq9lx/CyPHWznm082sK+5m9ULS3jttUuoLMoPulSRrJCzof3jfd7Fae9YvyjgSnJHYX6YbWsXsm3tQuqrinjieAc/O9jGp356hN+9flnQ5YlkhVlD28zuBbYB1WbWBHzUOffFVBeWaj985jTX1lewuFx/nifLXE5NL8gLc8uaGtbVlXHPjga++usT1FcV8fsvvCzVZYpktVlD2zn3hnQUkk4NZ2PsP9XDh19xZdClzHtTBflE1SVR/ujWVfzHzkY+/tCzhAze9gIFt8h0crJ75If7vNOs1TWSGfIjId5wQz2PH27nb77/LJFwiDdvWR50WSIZKedOY4+POe7Z0cCm5ZUsq9JZkJkiHDI+9cZrue2KhXzkO/u478mZW+giuSrnQvuR/S00dMR4p/pOM040EuYzb7qOW9bU8MEHn+Fbu5qCLkkk4+RcaH/+F8eoryri9qvUNZKJCvLC3P2WTdy0agF/+cDTfHdPc9AliWSUnOrT3nWyg90NXXzszqsIh7y5RmY7UCbpM/GzeOmVizjVNcj779tDJBTildcsDrAykcyRUy3tzz9+nPLCPF6/eWnQpcgs8iMhfm/rcpZVFfHebz7FQ3tPBV2SSEbImZb2zhMdPLy/hVvX1vCdpxQA2SAaCfO2rSt46JnT/PE9T/HLw2f4yKvWURKd/mury5jJfJcTLe3BkTh/9a29lBflcesaTQ6VTaJ5Ye75gxt51zZvLPfL/t/jfPpnR2jqjE37mviYY8xlxfQ3InOWEy3tz/zsCEfb+3nbTSuIRsJBlyNzFI2E+as7ruDFVyzk7398gE88fJBPPHyQ6pJ8KovyKcgL0z80Su/QKF2xYUbijpBBaUEelUV5tPYMcuvaGjYurSAU0rzpkt3MpaBFsnnzZrdz586kb/di7Gvu5rWf/hV3bqhj84qqoMuRizC5e6OxI8b/+sFzdPQPExseZSQ+RjQSpiAvTEEkRH5eiHjc0TM4QlvvEM1dAzgH9VVFvOnGev7b5mVUFmuCKsksZrbLObd5tvXmdUu7sSPG27/yJNUlUf7Hq9bxI3+SKMluy6qK5tTN9fL1i3jsUBv3PtHI//7RAf7x0UO8+po63rJ1ORuWluuqRZJV5m1on+kb4i1f3MHw6Bj3/9FWtaxy2Pgv69duXMKWlQvYcews3997im/tbuLqJeW8ZctyXrVhMUX58/bHQeaRedk9cvJsP+/86k4aO2N8451b2LS8EtCYbDlvaCROJBLia785waHWPqKREC+8vJptVyxk49IK1iwq0fEPSauc7R75+aF2/vBrOzGMN924nIMtvRxs0cVl5ULRPC+Q37p1BSfOxtjX3M3uhk5+cqANgEjIWF1byvq6MtbVlbGmtpTVtSXUlETVnSKBmjct7e7YCJ945ADf2NFAbWkBb96ynCp1icgcOOfo6B/mVPcgp7oGzt36h89f07IwL0xtWZSFZQW88urFrK4tYU1tKdUl0QArl/kgZ1raA8Nx7t/VyCf/8zCdsWHeunUFKxYUkx/JiSHokkRmxoKSKAtKoly9pBzwgrx3aJS2niFaewZp6x2ktWeIvU1dPHG849xrq4rzWb3QC/DO2DDlhXmUF+ZRVphHSTSiqWYlaRIKbTO7A/gkEAa+4Jz7PymtahbOOZ5p7uaHz7TwzScb6IqNcP2KSv7mzhu4qq5cfdeSNGZGWUEeZQV5XL6w5Nxy5xwvvrKWQ629HGrt5XBrH4faevn2U830DY1esI2Qwb89dpTF5QUsKi+grqKQRWUFFzyuLomemw/nUgyOxOkdHOXeHQ2YeVPeRsIhIiHjTVvqyQ+Hcr57Z7p8yJYzZxO53FgY+DTwUqAJeNLMvuecezYVBTnnGIk7hkbjDI2OMTgS50zfMK09gxxp6+PZUz3sOtlJS88gIYOXXFnL9ltWsml5Zc5/GSV9zIxFfujeMmH4oXOOz//iON0DI/QMjNDt36qK8zndPcC+5m4efbaVodGxC7YXDhkLS6NUFuVTXphHRZF3K/ZHtIw5cDicg+H4GD0DI/QMjkWqd9EAAApdSURBVPr/jtAzMErP4AjDk7Y70ccfepZIyCjKD1McjXg3/35RfphIKEQo5P3fQmaE7PzZpfExR3yMc/fPL3PkhUPn/qoon3QrKxzfT4TiaJiSaIRoJEwohLc/Iyk/t2NjjqHRMWLDowyMxBkYjhMbjp97b3oHz79PO092MDgyxsBwnMGROA6IRkL8/FAbxfkRiqJhSgvyqPA/B+//kn/uMynKixAOG5GQEQ4ZYbO0nrSVSEv7BuCIc+4YgJl9E3gNkPTQ3vCxR+gZHGGmbvZlVYVsWl7Ji65YyG1XLFS/tQRmuhZbSTRCSTTCkoqprz/qnGNgOE734PlQ90J+lIHhUUbHxjjS1kfXwAj9Q6MYXrAZgHkBU1aQR2lhHrGhUYrzIywojlKYF/JOMMoLE/W7B+NjjlH/dlVdGbHhUfqH4vQPjRIbjtM3NMqJs/0Mj44RH3M4wDkoLYgw5ty58PYC3egeGCbk1xIyw8yIj40xMDLGwEicweE48TkeJwv5fxGE/F8W47+czm1lwi8s579//uIZs2K6fUUjYQrzwxTkhSiIhDHz/kI5cSZGbMR7f3oGRhgdS3zj4+9RTWmUX33wtrkVNUeJhPYSoHHC4ybgxskrmdl2YLv/sM/MDl56eQBUA2fGH5wEfgl8Jkkbv0gX1JRBMrGuTKwJMrOuTKwJMrOuTKyJw1BtH7rouhI68JG0A5HOubuBu5O1vXFmtjORI6rplIk1QWbWlYk1QWbWlYk1QWbWlYk1QXrqSmSIRTOwbMLjpf4yERFJs0RC+0lgtZldZmb5wF3A91JbloiITGXW7hHn3KiZ/THwMN6Qvy855/anvLLzkt7lkgSZWBNkZl2ZWBNkZl2ZWBNkZl2ZWBOkoa6UnBEpIiKpodMGRUSyiEJbRCSbOOcy8gbcARwEjgAfTOJ2vwS0AfsmLKsCHgUO+/9W+ssN+Be/hr3AdRNe81Z//cPAWycs3wQ847/mXzjfBTXlPvznlgE/wzthaT/w3qDrAgqAJ4Cn/Zo+5i+/DNjhb+c+IN9fHvUfH/GfXzFh3x/ylx8EXjbbZzzdPiZ9jmHgKeChTKgLOOG/v3uAnUF/fhNeVwE8ABwAngO2BlkXsNZ/j8ZvPcD7MuS9ej/ed30fcC/ez0BGfN8vqDMdAXwRwRoGjgIrgXy84FiXpG3fAlzHhaH9f8ffROCDwN/7918B/Mj/4mwBdkz48I/5/1b698e/ZE/465r/2pfPtA//8eLxLyNQChwC1gVZl79eiX8/z/9SbQH+A7jLX/5Z4F3+/XcDn/Xv3wXc599f539+Uf/LedT/fKf9jKfbx6TP8c+Aezgf2oHWhRfa1ZNqDPR75S/7KvBO/34+XogHXteEn/MWvJNKgv4ZXAIcBwonfNZvm+4zJ83f9wvet3QHcoLBuhV4eMLjDwEfSuL2V3BhaB8EFvv3FwMH/fufA94weT3gDcDnJiz/nL9sMXBgwvJz6023j2nq+y7eXC8ZURdQBOzGOxP2DBCZ/DnhjS7a6t+P+OvZ5M9ufL3pPmP/NVPuY8K6S4GfALcBD830mnTVxdShHejnB5TjBZFlUl0T1r8d+FUm1MT5M7+r/O/JQ8DLpvvMSeP3ffItU/u0pzp1fkkK91frnDvt328BamepY6blTVMsn2kfFzCzFcC1eC3bQOsys7CZ7cHrTnoUr6XQ5ZwbnWI75/btP98NLLiIWhfMsI9x/wx8ABifHWmm16SrLgc8Yma7/CkdIPjv1WVAO/BlM3vKzL5gZsUZUNe4u/C6IWZaPy01OeeagX8AGoDTeN+TXQT/vXqeTA3twDjv150LYh9mVgJ8C3ifc64n6Lqcc3Hn3Ea8lu0NwBWp3H8izOxVQJtzblfQtUzyQufcdcDLgfeY2S0TnwzoexXB6wr8N+fctUA/XrdA0HXhn6h3J3B/IuunuiYzq8SbCO8yoA4oxuuDzjiZGtrpPnW+1cwWA/j/ts1Sx0zLl06xfKZ94C/LwwvsbzjnHsyUugCcc114B0q3AhVmFpliO+f27T9fDpy9iFrPzrAPgBcAd5rZCeCbeF0knwy6Lr+lhnOuDfg23i+5oD+/JqDJObfDf/wAXogHXRd4v9x2O+daZ1k/XTW9BDjunGt3zo0AD+J914L+vj9PpoZ2uk+d/x7ekWj8f787YfnvmWcL0O3/efUwcLuZVfq/oW/H64c6DfSY2RbzJgn+vUnbmmof+Ot+EXjOOfdPmVCXmdWYWYV/vxCvj/05vPB+3TQ1jW/ndcBP/dbM94C7zCxqZpcBq/EOFE35GfuvmW4fOOc+5Jxb6pxb4b/mp865NwVZl5kVm1np+H3/fd8X5Ofnv1ctQKOZrfUXvRhvhFKgdfnewPmukZnWT1dNDcAWMyvyXzf+XgX6fZ/STB3eQd7wjhofwutH/XASt3svXp/VCF5L5B14/Uo/wRsK9J9Alb+u4V0A4ijeEKLNE7bz+3hDdI4Ab5+wfDPeD+xR4FOcH2405T78516I96faXs4PhXpFkHUB1+ANqdvrv+6v/eUr/S/hEbw/baP+8gL/8RH/+ZUT9v1hf78H8Y/kz/QZT7ePKT7LbZwfPRJYXf7ypzk/PPLDs7y3afle+c9vBHb6n+N38EZaBP19L8ZrYZZPWJYJ79XH8IZG7gO+hjcCJGO+7+M3ncYuIpJFMrV7REREpqDQFhHJIgptEZEsotAWEckiCm0RkSyi0JaMZWYfNrP9ZrbXzPaY2Y0zrPsVM3vddM9PWOe4v63dZrZ1mvU+bmYvudT6RVIhaVdjF0kmP1BfhTf74ZCZVePNjnap/tI594CZ3Y43ydA1k/Ybds79dRL2I5ISamlLploMnHHODQE45844506Z2V+b2ZNmts/M7vbPXruAmW0ys5+bN3nTw+OnLk/yOHC5v/4JM/t7M9sNvH5iq93MrjezX5vZ02b2hJmVmjeR1if8Ovaa2R+m7m0QuZBCWzLVI8AyMztkZp8xs1v95Z9yzl3vnFsPFOK1xs8xbw6XfwVe55zbhHfRi7+dYvuvxjvDbtxZ59x1zrlvTthWPt6k9O91zm3Am59iAO8s2m7n3PXA9cAf+Kcsi6ScukckIznn+sxsE3Az8CLgPjP7INBrZh/Am+O7Cu+08e9PeOlaYD3wqN8ID+NNWzDuE2b2EbwpS98xYfl9U5SxFjjtnHvSr6kHwO9auWZCH3o53hwTxy/+fyySGIW2ZCznXBx4DHjMzJ4B/hCvD3qzc67RzP4Gbw6IiQzY75yb8iAjfp/2FMv751CaAX/inHt4Dq8RSQp1j0hGMrO1ZrZ6wqKNeBPwAJwxb+7xqUaLHARqxkeGmFmemV11kWUcBBab2fX+tkrNm0LzYeBdflcMZrbGn91PJOXU0pZMVQL8qz897CjeDGjbgS68Wdha8Ka7vIBzbtjvtvgXMyvH+47/M143ypz42/pdv45CvP7slwBfwLtk3W7/QGg78No5/w9FLoJm+RMRySLqHhERySIKbRGRLKLQFhHJIgptEZEsotAWEckiCm0RkSyi0BYRySL/H2LFk5+pt1VcAAAAAElFTkSuQmCC\n",
"text/plain": [
"