Answer To: Introduction For this week’s take-home lab, you will work on the same data set from Week 4/5...
Mohd answered on Feb 19 2022
-
-
-
2/18/2022
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.1.1
## Loading required package: ggplot2
## Loading required package: lattice
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.1
library(ggplot2)
library(magrittr)
library(rmarkdown)
library(readxl)
# Read the credit-card spreadsheet from the author's local path.
credit_xlsx <- "~/New folder (2)/ucicreditcard.xlsx"
ucicreditcard <- read_excel(credit_xlsx)
# View(ucicreditcard)
# Copy the response into a column with a syntactic name, then drop column 25
# by position (presumably the original `default payment next month` column;
# verify against the spreadsheet layout).
ucicreditcard[["default_payment"]] <- ucicreditcard$`default payment next month`
ucicreditcard <- ucicreditcard[, -25L]
# Turn selected category codes into missing values.
# MARRIAGE: code 0 becomes NA.
ucicreditcard$MARRIAGE[ucicreditcard$MARRIAGE == 0] <- NA
# Frequency table of EDUCATION before any recoding.
count(ucicreditcard, EDUCATION)
## # A tibble: 7 x 2
## EDUCATION n
##
## 1 0 14
## 2 1 10585
## 3 2 14030
## 4 3 4917
## 5 4 123
## 6 5 280
## 7 6 51
# EDUCATION: codes 5 and 6 become NA. Code 0 (14 rows in the table above) is
# left untouched by this step.
ucicreditcard$EDUCATION[ucicreditcard$EDUCATION %in% c(5, 6)] <- NA
# Distribution of PAY_0 and its NA count before recoding.
count(ucicreditcard, PAY_0)
## # A tibble: 11 x 2
## PAY_0 n
##
## 1 -2 2759
## 2 -1 5686
## 3 0 14737
## 4 1 3688
## 5 2 2667
## 6 3 322
## 7 4 76
## 8 5 26
## 9 6 11
## 10 7 9
## 11 8 19
sum(is.na(ucicreditcard$PAY_0))
## [1] 0
# Replace status code -2 with NA in each of the six PAY_* columns listed here.
pay_status_cols <- c("PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")
ucicreditcard[pay_status_cols] <- lapply(
  ucicreditcard[pay_status_cols],
  function(status) replace(status, status == -2, NA)
)
# Same checks after recoding: the 2759 former -2 rows now show up as NA.
count(ucicreditcard, PAY_0)
## # A tibble: 11 x 2
## PAY_0 n
##
## 1 -1 5686
## 2 0 14737
## 3 1 3688
## 4 2 2667
## 5 3 322
## 6 4 76
## 7 5 26
## 8 6 11
## 9 7 9
## 10 8 19
## 11 NA 2759
sum(is.na(ucicreditcard$PAY_0))
## [1] 2759
Training/test split of the data: building a 20% held-out test dataset
# Keep only rows with no missing value in any column.
ucicreditcard <- na.omit(ucicreditcard)
# Keep columns 2 through 25, i.e. drop the first column.
ucicreditcard <- ucicreditcard[, seq(2, 25)]
# Reproducible random split: every row is independently labelled 1 (training)
# with probability 0.8 or 2 (test) with probability 0.2, so the test share is
# about, not exactly, 20%.
set.seed(2223)
n_rows <- nrow(ucicreditcard)
inp <- sample(2, n_rows, replace = TRUE, prob = c(0.8, 0.2))
training_data <- ucicreditcard[inp == 1, ]
test_data <- ucicreditcard[inp == 2, ]
Fit the best KNN model and CART model you can (consider feature selection etc.) to the data to predict consumer default.
# Split the training and test sets into response and predictors for knn().
#
# The response is default_payment. Predictors are selected by name as every
# column except default_payment.
#
# Fix: the previous positional selection `[, 2:24]` was applied to a table that
# has 24 columns at this point (columns 2:25 of the cleaned data), whose last
# column is default_payment. It therefore passed the response to the classifier
# as a feature (target leakage) and left out the first predictor column.
# NOTE: the accuracy figures recorded further down in this document were
# produced with the old selection and need to be regenerated.
train.respo <- training_data$default_payment
test.respo <- test_data$default_payment
predictor_cols <- setdiff(names(training_data), "default_payment")
train.explano <- training_data[, predictor_cols]
test.explano <- test_data[, predictor_cols]
KNN Model with Summary
library(class)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.2
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.2
# Fit k-nearest-neighbour classifiers for several k and predict the test rows.
# The fits are kept in their original order (k = 1, 5, 10, 30, 15) because
# knn() breaks ties at random, so reordering could alter the predictions.
# NOTE(review): the predictors are passed to knn() as they are; no rescaling
# is visible in this script, so large-magnitude columns will dominate the
# distance. Consider standardising the features.
knn.1 <- knn(train = train.explano, test = test.explano, cl = train.respo, k = 1)
knn.5 <- knn(train = train.explano, test = test.explano, cl = train.respo, k = 5)
knn.10 <- knn(train = train.explano, test = test.explano, cl = train.respo, k = 10)
knn.30 <- knn(train = train.explano, test = test.explano, cl = train.respo, k = 30)
knn.15 <- knn(train = train.explano, test = test.explano, cl = train.respo, k = 15)

# Share of test rows whose predicted label equals the observed response.
knn_accuracy <- function(predicted) {
  sum(test.respo == predicted) / length(test.respo)
}

knn_accuracy(knn.1)
## [1] 0.6909478
knn_accuracy(knn.5)
## [1] 0.7512247
knn_accuracy(knn.10)
## [1] 0.7635783
knn_accuracy(knn.30)
## [1] 0.770394
knn_accuracy(knn.15)
## [1] 0.7682641
class(train.respo)
## [1]...