I have all of it done; I just need #9 and #11 on the assignment.
# Notebook setup: libraries, pandas display options, and the working sample.
import numpy as np
import pandas as pd

# Raise pandas' display limits so output is not truncated.
pd.options.display.max_colwidth = 1000  # max characters shown per cell
pd.options.display.max_rows = 1000  # max rows shown before truncation

# Read only the first 1000 rows of train.csv.zip.
dataset = pd.read_csv("train.csv.zip", nrows=1000)
dataset  # rendered as the cell output when run in a notebook
# Uncomment to inspect a single column:
# dataset["AVProductsInstalled"]

# print() is used throughout so every result is shown even when these
# statements share one notebook cell or run as a plain script.

# Number of missing values in each variable.
print(dataset.isnull().sum())

# Percentage of missing values in each variable. Dividing by len(dataset)
# instead of a hard-coded 1000 keeps this correct for any number of rows.
print(dataset.isnull().sum() / len(dataset) * 100)

# Total number of missing values across all variables.
print(dataset.isnull().sum().sum())

# Data type of each variable.
print(dataset.dtypes)
# Produce summary statistics for numerical attributes and value counts
# for categorical attributes.

# Cast the target to object so the loop below reports it with value counts.
dataset["HasDetections"] = dataset["HasDetections"].astype("object")

for var_name in dataset.columns:
    print("-" * 50)
    print(var_name)
    col_dtype = dataset[var_name].dtype
    # np.dtype("O") is used instead of the np.object alias, which was
    # removed from NumPy and raises AttributeError on current versions.
    if col_dtype == np.dtype("O"):
        print(dataset[var_name].value_counts())
    elif col_dtype == np.int64 or col_dtype == np.float64:
        print(dataset[var_name].describe())

# Missing values imputation
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Impute missing values.

    Columns of dtype object are imputed with the most frequent value in column.
    Columns of other types are imputed with mean of column.
    """

    def fit(self, X, y=None):
        """Compute one fill value per column of X and store them in self.fill.

        X is a pandas DataFrame; y is accepted but not used.
        Returns self so that calls can be chained.
        """
        self.fill = pd.Series(
            [self._fill_value(X[c]) for c in X],
            index=X.columns,
        )
        return self

    @staticmethod
    def _fill_value(column):
        """Return the value used to replace missing entries of one column."""
        if column.dtype == np.dtype("O"):
            counts = column.value_counts()
            # value_counts() ignores missing entries, so it is empty when the
            # column has no observed value; indexing [0] would then raise
            # IndexError. Such a column is left unfilled instead.
            return counts.index[0] if len(counts) else np.nan
        return column.mean()

    def transform(self, X, y=None):
        """Return a copy of X with missing entries replaced by self.fill.

        fit() must have been called first; y is accepted but not used.
        """
        return X.fillna(self.fill)


dataset = DataFrameImputer().fit_transform(dataset)

# Total number of missing values left after imputation.
dataset.isnull().sum().sum()  # rendered as the cell output in a notebook
# encode string values to integers
from sklearn.preprocessing import LabelEncoder

# maintain a dict of integer -> original value mappings for each column
label_dict = dict()
for var_name in dataset:
    # only map object (string) columns; np.dtype("O") is used instead of the
    # np.object alias, which was removed from NumPy
    if dataset[var_name].dtype != np.dtype("O"):
        continue
    le = LabelEncoder()
    # replace the original values by their integer codes
    dataset[var_name] = le.fit_transform(dataset[var_name])
    # le.classes_[i] is the original value encoded as i, so enumerating it
    # yields the code -> value mapping directly
    label_dict[var_name] = dict(enumerate(le.classes_))
label_dict  # rendered as the cell output when run in a notebook
# Split the data into training and testing sets (80-20)
from sklearn.model_selection import train_test_split

# every column except the target is used as a predictor
attributes = [col for col in dataset if col != "HasDetections"]
train_x, test_x, train_y, test_y = train_test_split(
    dataset[attributes],
    dataset["HasDetections"],
    test_size=0.2,
    random_state=123,  # fixed seed so the split is reproducible
)

print("shapes")
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

# blank separator line; a bare `print` without parentheses is a no-op in Python 3
print()

print("class counts")
print(dataset["HasDetections"].value_counts())
print(train_y.value_counts())
print(test_y.value_counts())
# Compare the three counts to check that the random split kept a similar
# class balance; the split is not explicitly stratified (no `stratify=`).
# Decision tree classifier: train on the training split, score on the test split.
from sklearn import tree

clf = tree.DecisionTreeClassifier()
# train model
clf = clf.fit(train_x, train_y)
# make prediction: one predicted label per row of test_x
pred_y = clf.predict(test_x)

# evaluate the prediction results
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# scikit-learn metrics take (y_true, y_pred). Passing the predictions first
# swaps the reported precision and recall, so the true labels go first.
print("f1:" + str(f1_score(test_y, pred_y)))
print("accuracy:" + str(accuracy_score(test_y, pred_y)))
print("precision:" + str(precision_score(test_y, pred_y)))
print("recall:" + str(recall_score(test_y, pred_y)))
# Linear support vector classifier: train on the training split, score on the test split.
from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=123456)
# train/build model
clf = clf.fit(train_x, train_y)
# make prediction
pred_y = clf.predict(test_x)

# evaluate the prediction results
# scikit-learn metrics take (y_true, y_pred). Passing the predictions first
# swaps the reported precision and recall, so the true labels go first.
print("f1:" + str(f1_score(test_y, pred_y)))
print("accuracy:" + str(accuracy_score(test_y, pred_y)))
print("precision:" + str(precision_score(test_y, pred_y)))
print("recall:" + str(recall_score(test_y, pred_y)))
# Multinomial naive Bayes classifier: train on the training split, score on the test split.
from sklearn.naive_bayes import MultinomialNB

# NOTE(review): MultinomialNB expects non-negative feature values - confirm
# that no column of train_x / test_x contains negative numbers.
clf = MultinomialNB()
# train model
clf = clf.fit(train_x, train_y)
# make prediction
pred_y = clf.predict(test_x)

# evaluate the prediction results
# scikit-learn metrics take (y_true, y_pred). Passing the predictions first
# swaps the reported precision and recall, so the true labels go first.
print("f1:" + str(f1_score(test_y, pred_y)))
print("accuracy:" + str(accuracy_score(test_y, pred_y)))
print("precision:" + str(precision_score(test_y, pred_y)))
print("recall:" + str(recall_score(test_y, pred_y)))