Reading data
data.train = read.csv('data_training.csv')
data.grid = read.csv('data_grid.csv')
# convert y to factor
data.train$y = factor(data.train$y)
# set seed
set.seed(270216)
logistic regression model up to 5th order polynomial
lr1 <- glm(y ~ x1 + x2, data=data.train, family=binomial())
lr2 <- glm(y ~ polym(x1, x2, degree = 2, raw = TRUE), data=data.train, family=binomial())
lr3 <- glm(y ~ polym(x1, x2, degree = 3, raw = TRUE), data=data.train, family=binomial())
lr4 <- glm(y ~ polym(x1, x2, degree = 4, raw = TRUE), data=data.train, family=binomial())
lr5 <- glm(y ~ polym(x1, x2, degree = 5, raw = TRUE), data=data.train, family=binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
5 folds cv with cuttoff probability P(y=1|x1,x2) = 0.5
library('boot')
cv1 = cv.glm(data=data.train, glmfit=lr1, cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
cv2 = cv.glm(data=data.train, glmfit=lr2, cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
cv3 = cv.glm(data=data.train, glmfit=lr3, cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
cv4 = cv.glm(data=data.train, glmfit=lr4, cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv5 = cv.glm(data=data.train, glmfit=lr5, cost <- function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv1$delta
## [1] 0.285 0.283
cv2$delta
## [1] 0.295 0.303
cv3$delta
## [1] 0.290 0.299
cv4$delta # best
## [1] 0.235 0.225
cv5$delta
## [1] 0.275 0.262
prediction on grid data
data.grid$prob.lr4 <- predict(lr4, newdata=data.grid[1:2], type="response")
plot the prediction and data points
p0 <- ggplot() + geom_point(data= data.train, aes(x=x1, y=x2, color=y), size=2) + scale_color_manual(values=c("green", "red"))
# plot the decision boundary of the logistic regression prediction (a contour plot with a cut at the level 0.5)
p.lr4 <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.lr4), breaks=c(0.5))
p.lr4
LASSO up to 5th order polynomial
library('glmnet')
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
# preparing the data
x.las = as.matrix(data.train[1:2])
y.las = data.train$y
xtest.las = as.matrix(data.grid[1:2])
10 folds cross validation of lasso models using AUC
cvfit1 = cv.glmnet(x.las, y.las, family = "binomial", type.measure = "auc")
cvfit2 = cv.glmnet(poly(x.las,degree = 2, raw = TRUE), y.las, family = "binomial", type.measure = "auc")
cvfit3 = cv.glmnet(poly(x.las,degree = 3, raw = TRUE), y.las, family = "binomial", type.measure = "auc")
cvfit4 = cv.glmnet(poly(x.las,degree = 4, raw = TRUE), y.las, family = "binomial", type.measure = "auc")
cvfit5 = cv.glmnet(poly(x.las,degree = 5, raw = TRUE), y.las, family = "binomial", type.measure = "auc")
max(cvfit1$cvm)
## [1] 0.8046904
max(cvfit2$cvm)
## [1] 0.8417601
max(cvfit3$cvm)
## [1] 0.8221147
max(cvfit4$cvm)
## [1] 0.8310342
max(cvfit5$cvm) # best
## [1] 0.845408
predictions
data.grid$prob.las.lr5 = predict(cvfit5, newx = poly(xtest.las,degree = 5, raw = TRUE), s='lambda.min', type = 'response')
data.grid$prob.las.lr5 = as.vector(data.grid$prob.las.lr5)
plot Lasso with degree 5 polynomial
p.las.lr5 <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.las.lr5), breaks=c(0.5))
p.las.lr5
TREE
library('tree')
tree1 = tree(y ~ x1 + x2, data = data.train)
pruning by cross validation
tree1.cv = cv.tree(tree1)
# optimal tree size obtained by CV
optimal <- which.min(tree1.cv$dev)
optimal.k <- tree1.cv$k[optimal]
optimal.size <- tree1.cv$size[optimal]
# the final pruned tree
tree1.pruned <- prune.tree(tree1, best=optimal.size)
predictions
p = predict(tree1.pruned, newdata = data.grid[,1:2], type = "vector")
data.grid$prob.tree.pruned = p[,2]
plot pruned tree
p.tree.pruned <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.tree.pruned), breaks=c(0.5))
p.tree.pruned
Random Forest
library('ranger')
# tune random forest (mtry) manually
mse.rfs <- rep(0, 2)
for(m in 1:2){
rf <- ranger(y ~ ., data=data.train, mtry=m, probability = TRUE)
mse.rfs[m] <- rf$prediction.error
}
plot(1:2, mse.rfs, type="b", xlab="mtry", ylab="OOB Error")
mse.rfs # 1 is the best
## [1] 0.1452021 0.1466274
predictions
rf <- ranger(y ~ ., data=data.train, mtry=1, probability = TRUE)
p = predictions(predict(rf, data.grid[,1:2]))
data.grid$prob.rf = p[,2]
plot rf
p.rf <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.rf), breaks=c(0.5))
p.rf
Boosting Classification
library(xgboost)
#setting up xbg.DMatrix
dtrain <- xgb.DMatrix(data = as.matrix(data.train[1:2]), label = as.matrix(data.train$y))
parameter tuning 100 rounds, random selection of parameters
best.param = list()
best.cv = Inf
best.cv.index = 0
best.nround = 0
for (iter in 1:100) {
param <- list(objective = "binary:logistic",
max_depth = sample(1:10, 1),
eta = runif(1, 0.001, 0.05),
subsample = runif(1, .5, .9),
colsample_bytree = runif(1, .5, .9)
)
cv.nround = sample(1000:10000,1)
mdcv <- xgb.cv(data=dtrain, params = param, nthread=16,
nfold=10, nrounds=cv.nround,
verbose = FALSE, early_stopping_rounds=100)
min.cv = min(mdcv$evaluation_log$test_error_mean)
min.cv.index = which.min(mdcv$evaluation_log$test_error_mean)
if (min.cv < best.cv) {
best.cv = min.cv
best.cv.index = min.cv.index
best.param = param
}
}
# best model
nround = best.cv.index
best.boost <- xgboost(data=dtrain, params=best.param, nrounds=nround, nthread=16, verbose = FALSE)
predictions
data.grid$prob.boost = predict(best.boost, as.matrix(data.grid[1:2]))
plot of gbm
p.boost = p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.boost), breaks=c(0.5))
p.boost
SVM wtih RBF Kernal(default)
library("e1071")
# tuning with 10 fold cross validation
obj <- tune.svm(y~., data = data.train, gamma = 2^(-1:1), cost = 2^(0:10), probability = TRUE)
obj
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 2 2
##
## - best performance: 0.16
# best svm
svm.best = svm(y ~ ., data = data.train, gamma = 2, cost = 2, probability = TRUE)
predictions
p = predict(svm.best, newdata = data.grid[1:2], probability = TRUE)
data.grid$prob.svm = attr(p,which = 'probabilities')[,2]
plot svm
p.svm = p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.svm), breaks=c(0.5))
p.svm
neural net
library('caret')
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
# parameter fitting and best model
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 10)
nn <- train(y ~ ., data = data.train, method = 'nnet', trControl = fitControl, trace = FALSE)
nn
## Neural Network
##
## 200 samples
## 2 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0e+00 0.7175 0.435
## 1 1e-04 0.7190 0.438
## 1 1e-01 0.7155 0.431
## 3 0e+00 0.7300 0.460
## 3 1e-04 0.7355 0.471
## 3 1e-01 0.7090 0.418
## 5 0e+00 0.7610 0.522
## 5 1e-04 0.7550 0.510
## 5 1e-01 0.7205 0.441
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.
predictions
data.grid$prob.nn = predict(nn, newdata = data.grid[1:2], type = 'prob')$'1'
plot nn
p.nn = p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.nn), breaks=c(0.5))
p.nn