Reading data

data.train = read.csv('data_training.csv')
data.grid = read.csv('data_grid.csv')

# convert y to factor
data.train$y = factor(data.train$y)

# set seed
set.seed(270216)
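
Before modelling, a quick structure check is useful to confirm the column types (this assumes the files contain the two predictors x1, x2 and the response y, as used throughout below):

# sanity check of the training data: column names, types, and ranges
str(data.train)
summary(data.train)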

Logistic regression models up to 5th-order polynomial

lr1 <- glm(y ~ x1 + x2, data=data.train, family=binomial())
lr2 <- glm(y ~ polym(x1, x2, degree = 2, raw = TRUE), data=data.train, family=binomial())
lr3 <- glm(y ~ polym(x1, x2, degree = 3, raw = TRUE), data=data.train, family=binomial())
lr4 <- glm(y ~ polym(x1, x2, degree = 4, raw = TRUE), data=data.train, family=binomial())
lr5 <- glm(y ~ polym(x1, x2, degree = 5, raw = TRUE), data=data.train, family=binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

5-fold CV with cutoff probability P(y=1|x1,x2) = 0.5

library('boot')
cv1 = cv.glm(data=data.train, glmfit=lr1, cost = function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
cv2 = cv.glm(data=data.train, glmfit=lr2, cost = function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
cv3 = cv.glm(data=data.train, glmfit=lr3, cost = function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
cv4 = cv.glm(data=data.train, glmfit=lr4, cost = function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
cv5 = cv.glm(data=data.train, glmfit=lr5, cost = function(r, pi = 0) mean(abs(r-pi) > 0.5), K = 5)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## (warning repeated for the higher-order refits during cross-validation)
cv1$delta
## [1] 0.285 0.283
cv2$delta
## [1] 0.295 0.303
cv3$delta
## [1] 0.290 0.299
cv4$delta # best
## [1] 0.235 0.225
cv5$delta 
## [1] 0.275 0.262

prediction on grid data

data.grid$prob.lr4 <- predict(lr4, newdata=data.grid[1:2], type="response")

plot the prediction and data points

library('ggplot2')
p0 <- ggplot() + geom_point(data=data.train, aes(x=x1, y=x2, color=y), size=2) + scale_color_manual(values=c("green", "red"))

# plot the decision boundary of the logistic regression prediction (a contour plot with a cut at the level 0.5)
p.lr4 <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.lr4), breaks=c(0.5)) 

p.lr4

LASSO up to 5th-order polynomial

library('glmnet')
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-13
# preparing the data
x.las = as.matrix(data.train[1:2])
y.las = data.train$y
xtest.las = as.matrix(data.grid[1:2])

10-fold cross-validation of LASSO models using AUC

cvfit1 = cv.glmnet(x.las, y.las, family = "binomial", type.measure = "auc")
cvfit2 = cv.glmnet(poly(x.las,degree = 2, raw = TRUE), y.las, family = "binomial", type.measure = "auc")
cvfit3 = cv.glmnet(poly(x.las,degree = 3, raw = TRUE), y.las, family = "binomial", type.measure = "auc")
cvfit4 = cv.glmnet(poly(x.las,degree = 4, raw = TRUE), y.las, family = "binomial", type.measure = "auc")
cvfit5 = cv.glmnet(poly(x.las,degree = 5, raw = TRUE), y.las, family = "binomial", type.measure = "auc")

max(cvfit1$cvm)
## [1] 0.8046904
max(cvfit2$cvm)
## [1] 0.8417601
max(cvfit3$cvm)
## [1] 0.8221147
max(cvfit4$cvm) 
## [1] 0.8310342
max(cvfit5$cvm) # best
## [1] 0.845408
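
The same kind of summary works for the lasso fits; note that cvm is AUC here, so larger is better:

# best CV AUC of each polynomial degree (larger is better)
auc.cv <- sapply(list(cvfit1, cvfit2, cvfit3, cvfit4, cvfit5), function(f) max(f$cvm))
names(auc.cv) <- paste0("degree", 1:5)
auc.cv
which.max(auc.cv)  # degree 5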

predictions

data.grid$prob.las.lr5 = predict(cvfit5, newx = poly(xtest.las,degree = 5, raw = TRUE), s='lambda.min', type = 'response')
data.grid$prob.las.lr5 = as.vector(data.grid$prob.las.lr5)

plot Lasso with degree 5 polynomial

p.las.lr5 <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.las.lr5), breaks=c(0.5)) 
p.las.lr5

Classification tree

library('tree')

tree1 = tree(y ~ x1 + x2, data = data.train)

Pruning by cross-validation

tree1.cv = cv.tree(tree1)

# optimal tree size obtained by CV
optimal <- which.min(tree1.cv$dev)
optimal.k <- tree1.cv$k[optimal]
optimal.size <- tree1.cv$size[optimal]
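
Plotting deviance against tree size makes the CV choice visible (a quick diagnostic sketch using the size and dev fields returned by cv.tree):

# CV deviance as a function of tree size; the dashed line marks the chosen size
plot(tree1.cv$size, tree1.cv$dev, type = "b", xlab = "Tree size", ylab = "CV deviance")
abline(v = optimal.size, lty = 2)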

# the final pruned tree
tree1.pruned <- prune.tree(tree1, best=optimal.size)

predictions

p = predict(tree1.pruned, newdata = data.grid[,1:2], type = "vector")

data.grid$prob.tree.pruned = p[,2]

plot pruned tree

p.tree.pruned <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.tree.pruned), breaks=c(0.5)) 
p.tree.pruned

Random Forest

library('ranger')

# tune random forest (mtry) manually using the OOB prediction error (Brier score for a probability forest)
mse.rfs <- rep(0, 2)
for(m in 1:2){
  rf <- ranger(y ~ ., data=data.train, mtry=m, probability = TRUE)
  mse.rfs[m] <- rf$prediction.error
}
plot(1:2, mse.rfs, type="b", xlab="mtry", ylab="OOB Error")

mse.rfs # 1 is the best
## [1] 0.1452021 0.1466274

predictions

rf <- ranger(y ~ ., data=data.train, mtry=1, probability = TRUE)
p = predictions(predict(rf, data.grid[,1:2]))

data.grid$prob.rf = p[,2]

plot rf

p.rf <- p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.rf), breaks=c(0.5)) 
p.rf

Boosting Classification

library(xgboost)

# setting up xgb.DMatrix (labels must be numeric 0/1, so convert the factor back)
dtrain <- xgb.DMatrix(data = as.matrix(data.train[1:2]), label = as.numeric(as.character(data.train$y)))

Parameter tuning: 100 rounds of random parameter search

best.param = list()
best.cv = Inf
best.cv.index = 0
best.nround = 0

for (iter in 1:100) {
  param <- list(objective = "binary:logistic",
                max_depth = sample(1:10, 1),
                eta = runif(1, 0.001, 0.05),
                subsample = runif(1, .5, .9),
                colsample_bytree = runif(1, .5, .9)
  )
  cv.nround = sample(1000:10000,1)
  mdcv <- xgb.cv(data=dtrain, params = param, nthread=16, 
                 nfold=10, nrounds=cv.nround,
                 verbose = FALSE, early_stopping_rounds=100)
  
  min.cv = min(mdcv$evaluation_log$test_error_mean)
  min.cv.index = which.min(mdcv$evaluation_log$test_error_mean)
  
  if (min.cv < best.cv) {
    best.cv = min.cv
    best.cv.index = min.cv.index
    best.param = param
  }
}
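
After the search it is worth inspecting what was selected before refitting (this just prints the values stored by the loop above):

# inspect the selected hyper-parameters and the corresponding CV result
best.param
best.cv        # lowest mean CV test error found
best.cv.index  # boosting round at which it occurred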

# best model 
nround = best.cv.index
best.boost <- xgboost(data=dtrain, params=best.param, nrounds=nround, nthread=16, verbose = FALSE)

predictions

data.grid$prob.boost = predict(best.boost, as.matrix(data.grid[1:2]))

plot of the boosted model

p.boost = p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.boost), breaks=c(0.5)) 
p.boost

SVM with RBF kernel (default)

library("e1071")

# tuning with 10 fold cross validation
obj <- tune.svm(y~., data = data.train, gamma = 2^(-1:1), cost = 2^(0:10), probability = TRUE)
obj
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  gamma cost
##      2    2
## 
## - best performance: 0.16
# best svm
svm.best = svm(y ~ ., data = data.train, gamma = 2, cost = 2, probability = TRUE)

predictions

p = predict(svm.best, newdata = data.grid[1:2], probability = TRUE)
data.grid$prob.svm = attr(p,which = 'probabilities')[,2]
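
The column order of the 'probabilities' attribute follows the factor level ordering seen in training, so indexing the column by class label is a more robust alternative (assuming the positive class is labelled '1', as in this data):

# equivalent, but selects the probability column by class label instead of position
data.grid$prob.svm <- attr(p, "probabilities")[, "1"]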

plot svm

p.svm = p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.svm), breaks=c(0.5)) 
p.svm

Neural network

library('caret')
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
# parameter tuning and best model
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 10)
nn <- train(y ~ ., data = data.train, method = 'nnet', trControl = fitControl, trace = FALSE)
nn
## Neural Network 
## 
## 200 samples
##   2 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 180, 180, 180, 180, 180, 180, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  Accuracy  Kappa
##   1     0e+00  0.7175    0.435
##   1     1e-04  0.7190    0.438
##   1     1e-01  0.7155    0.431
##   3     0e+00  0.7300    0.460
##   3     1e-04  0.7355    0.471
##   3     1e-01  0.7090    0.418
##   5     0e+00  0.7610    0.522
##   5     1e-04  0.7550    0.510
##   5     1e-01  0.7205    0.441
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were size = 5 and decay = 0.
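
The chosen tuning values are also stored on the train object, which is handy if they are needed programmatically (bestTune is a standard caret field):

# the tuning combination caret selected (size and decay)
nn$bestTune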

predictions

data.grid$prob.nn = predict(nn, newdata = data.grid[1:2], type = 'prob')$'1'

plot nn

p.nn = p0 + stat_contour(data= data.grid, aes(x=x1, y=x2, z=prob.nn), breaks=c(0.5)) 
p.nn