# Answer To: Malware Identification – Supervised Learning (45%) Page 1 of 6 Creating a model to detect malware...
# Naveen answered on May 12 2021
# Install required packages, but only those that are not installed yet.
# An unconditional install.packages() re-downloads every package on each run
# and makes the script fail when no network is available.
required_packages <- c("glmnet", "caret", "dplyr", "party", "ipred")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# load required packages
library(glmnet)
library(caret)
library(dplyr)
library(party)
library(ipred)
# NOTE: the workspace is intentionally NOT cleared with rm(list = ls()).
# Wiping the global environment destroys unrelated objects of whoever sources
# this script; run it in a fresh R session if a clean workspace is needed.
# -------------------- Part 1 ----------------------------------------
# Import the dataset MalwareSamples10000.csv
# NOTE(review): the path below is all lowercase while the comment above uses
# mixed case; on a case-sensitive file system only one of them can match the
# file on disk - confirm the real file name.
malware <- read.csv("malwaresamples10000.csv")
# print first SIX records
head(malware)
# Print dimension of the data
dim(malware)
# print structure of the data
# str() prints by itself and returns NULL invisibly, so wrapping it in print()
# would emit an extra "NULL" line.
str(malware)
# Encode senderDomainSuffix as a numeric code through a lookup table.
# Known suffixes map to 0..6; any other non-missing value maps to 7 and
# missing values stay NA.
# NOTE(review): ".in" is the only key with a leading dot - confirm it matches
# the spelling used in the data.
suffix_codes <- c(
  ".in" = 0,
  "co.uk" = 1,
  "com" = 2,
  "com.au" = 3,
  "edu.au" = 4,
  "net" = 5,
  "net.au" = 6
)
suffix_raw <- as.character(malware$senderDomainSuffix)
suffix_encoded <- unname(suffix_codes[match(suffix_raw, names(suffix_codes))])
# Values that are present but not in the table fall into the catch-all code.
suffix_encoded[is.na(suffix_encoded) & !is.na(suffix_raw)] <- 7
malware$senderDomainSuffix <- as.numeric(suffix_encoded)
# Recode the Yes/No columns as numbers: "Yes" -> 1, "No" -> 2.
# Each column is pulled out as a plain vector with [[ ]]. Feeding one-column
# data frames (malware[i]) to ifelse() only works through accidental list
# coercion and errors as soon as a value other than Yes/No appears.
# Values that are neither "Yes" nor "No" are kept if they parse as numbers and
# become NA otherwise; missing values stay NA.
name <- c('isMalware', 'hasExe', 'hasZip', 'hasPDF', 'hasDoc', 'hasUnknown', 'hasURL')
for (i in name) {
  raw_values <- as.character(malware[[i]])
  # Parsing "Yes"/"No" as numbers is expected to yield NA; silence only that.
  numeric_fallback <- suppressWarnings(as.numeric(raw_values))
  malware[[i]] <- as.numeric(
    ifelse(
      raw_values == "Yes",
      1,
      ifelse(raw_values == "No", 2, numeric_fallback)
    )
  )
}
# Drop the first column by position (the specimen identifier, per the
# original comment); the result stays a data frame.
malware <- malware[-1]
# Show the first six records after the column has been removed
head(malware)
# Seed the RNG with the student ID so the split is reproducible
set.seed(10460276)
# Draw 80% of the row indices (rounded) for the training partition
n_train <- round(nrow(malware) * 80 / 100)
samples <- sample(seq_len(nrow(malware)), size = n_train)
# -------------------- Part 2 ----------------------------------------
# Re-seed with the student ID so the model draw below is reproducible
set.seed(10460276)
# Candidate penalised logistic regressions
models.list1 <- c(
  "Logistic Ridge Regression",
  "Logistic LASSO Regression",
  "Logistic Elastic-Net Regression"
)
# Candidate tree-based classifiers
models.list2 <- c(
  "Classification Tree",
  "Bagging Tree",
  "Random Forest"
)
# Draw one model from each list; the penalised model is drawn first so the
# random number stream is consumed in a fixed order.
penalised_choice <- sample(models.list1, size = 1)
tree_choice <- sample(models.list2, size = 1)
myModels <- c("Binary Logistic Regression", penalised_choice, tree_choice)
# Show the selected models as a one-column data frame
myModels %>% data.frame()
# Partition the data: sampled rows form the training set, the rest the test set
train <- malware[samples, , drop = FALSE]
test <- malware[-samples, , drop = FALSE]
# Separate the last column from the remaining columns of the test set
# (assumes the response isMalware is the last column - TODO confirm)
response_col <- ncol(test)
x_test <- test[, -response_col, drop = FALSE]
y_test <- test[, response_col, drop = FALSE]
# -------------------- Binary Logistic Regression ----------------------------------------
# Build Binary Logistic Regression model.
# glm() defaults to family = gaussian, which would fit an ordinary linear
# model; a logistic regression needs family = binomial. The binomial family
# also needs a 0/1 response, so the 1/2 coding (1 = "Yes", 2 = "No") is mapped
# to 1/0 on a local copy, leaving `train` itself untouched.
glm_train <- train
glm_train$isMalware <- as.integer(glm_train$isMalware == 1)
glm_model <- glm(isMalware ~ ., data = glm_train, family = binomial)
# print summary of the model
summary(glm_model)
# predict using the test data
# type = "response" returns P(isMalware == "Yes"); predict() ignores the
# response column in newdata, so the whole test set can be passed.
glm_prob <- predict(glm_model, newdata = test, type = "response")
# Threshold at 0.5 and translate back to the 1/2 coding used in the script.
glm_pred <- as.vector(ifelse(glm_prob > 0.5, 1, 2))
# Error metrics comparing the predicted class codes with the observed ones
act <- test$isMalware
glm_error <- glm_pred - act
glm_mse <- mean(glm_error^2)
glm_rmse <- RMSE(glm_pred, act)
glm_rsq <- R2(glm_pred, act)
# Collect the three metrics in a one-row data frame
glm_performance <- data.frame(
  MSE = glm_mse,
  RMSE = glm_rmse,
  Rsquare = glm_rsq
)
# Display the metrics
print(glm_performance)
# Binary Logistic Regression model confusion matrix.
# The levels are given explicitly (1 = "Yes", 2 = "No") so the labelling does
# not depend on which codes happen to occur: without `levels`, factor() fails
# when only one class is predicted and mislabels when other values appear.
# Codes outside 1/2 become NA.
class_codes <- c(1, 2)
class_labels <- c("Yes", "No")
glm_CM <- confusionMatrix(
  data = factor(glm_pred, levels = class_codes, labels = class_labels),
  reference = factor(act, levels = class_codes, labels = class_labels)
)
# print confusion matrix
print(glm_CM)
# Turn the training response into a factor: rows coded 1 get level "1",
# every other non-missing code gets level "2".
is_malware_row <- train$isMalware == 1
train$isMalware <- factor(ifelse(is_malware_row, 1, 2))
# define the control using a logistic regression...