Running for one dataset first for a simple analysis

Data generating model

# packages used throughout: e1071 for svm(), ggplot2 for the plots
library(e1071)
library(ggplot2)

set.seed(13579)

# dimensionality of the data
N <- 20  # number of data points

# the true weights (w1, w2)
w <- c(2, -4)
b <- 0.5

## generate X
# x is a matrix with 2 rows and N columns; each column is one data point
x <- matrix(nrow = 2, ncol = N)
# x_1 and x_2 are Normal(0, 1), so the data do not need scaling
x[1, ] <- rnorm(N, mean=0, sd=1)
x[2, ] <- rnorm(N, mean=0, sd=1)

## generate Y
y <- as.vector(ifelse(t(w) %*% x + b > 0, 1, -1))
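
In other words, the labels come from the noiseless target \(f(\mathbf{x}) = \operatorname{sign}(\mathbf{w}^\top \mathbf{x} + b)\) with \(\mathbf{w} = (2, -4)\) and \(b = 0.5\), so the training data are linearly separable by construction.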

Perceptron Learning Algorithm

pla <- function(x, y, verbose=0) {
  
  x <- rbind(x, 1)
  
  # initialize w.hat
  w.hat <- c(0, 0, 0)
  t <- 0
  
  # start loop
  while(TRUE) {
    y.hat <- ifelse(t(w.hat) %*% x > 0, 1, -1)
    errors <- (y.hat != y)
    if (verbose)
      cat("Iter:", t, " w1 =", w.hat[1], " w2 =", w.hat[2], " w0 =", w.hat[3], " errors =", sum(errors))
    t <- t + 1
    
    if(sum(errors) == 0) {
      # no more classification errors, the algorithm stops
      if (verbose) cat("\n")
      break
    } else {
      # choose an error point at random to update w
      # (indexing into which(errors) avoids sample()'s 1:n behaviour
      #  when only a single point is misclassified)
      point <- which(errors)[sample.int(sum(errors), 1)]
      ## alternatively, choose the first error point to update w
      # point <- which(errors)[1]
      
      if (verbose)
        cat("  point selected:", point, "\n")
      
      # update w
      w.hat <- w.hat + y[point] * x[, point]
    }
  }
  return(w.hat)
}

w.pla <- pla(x,y)
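
The update step inside `pla()` is the standard perceptron rule: pick a misclassified point \((\mathbf{x}_i, y_i)\) and move the weights towards classifying it correctly,
\[
\hat{\mathbf{w}} \leftarrow \hat{\mathbf{w}} + y_i\, \mathbf{x}_i,
\]
where \(\mathbf{x}_i\) includes the constant 1 appended by `rbind(x, 1)`, so the bias is carried as the third component of \(\hat{\mathbf{w}}\).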

SVM

data <- data.frame(x1=x[1, ], x2=x[2, ], y=y)
svm1 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=1e+10, scale=FALSE)

# coefficients
w.svm <- colSums(as.vector(svm1$coefs) * svm1$SV)
b.svm <- - svm1$rho

# sanity check: should be 1, otherwise it is label switching
sgn <- sign((w.svm %*% svm1$SV[3,] + b.svm) * y[svm1$index[3]])

# margin
margin <- 1 / sqrt(t(w.svm) %*% w.svm)

# different SVM models
svm2 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=10, scale=FALSE)
svm3 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=1, scale=FALSE)
svm4 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=0.1, scale=FALSE)
svm5 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=0.03, scale=FALSE)

# getting coefficients
# 2
w.svm2 <- colSums(as.vector(svm2$coefs) * svm2$SV)
b.svm2 <- - svm2$rho
margin2 <- 1 / sqrt(t(w.svm2) %*% w.svm2)
# sanity check: should be 1, otherwise it is label switching
sgn2 <- sign((w.svm2 %*% svm2$SV[3,] + b.svm2) * y[svm2$index[3]])

# 3
w.svm3 <- colSums(as.vector(svm3$coefs) * svm3$SV)
b.svm3 <- - svm3$rho
margin3 <- 1 / sqrt(t(w.svm3) %*% w.svm3)
# sanity check: should be 1, otherwise it is label switching
sgn3 <- sign((w.svm3 %*% svm3$SV[3,] + b.svm3) * y[svm3$index[3]])

# 4
w.svm4 <- colSums(as.vector(svm4$coefs) * svm4$SV)
b.svm4 <- - svm4$rho
margin4 <- 1 / sqrt(t(w.svm4) %*% w.svm4)
# sanity check: should be 1, otherwise it is label switching
sgn4 <- sign((w.svm4 %*% svm4$SV[3,] + b.svm4) * y[svm4$index[3]])

# 5
w.svm5 <- colSums(as.vector(svm5$coefs) * svm5$SV)
b.svm5 <- - svm5$rho
margin5 <- 1 / sqrt(t(w.svm5) %*% w.svm5)
# sanity check: should be 1, otherwise it is label switching
sgn5 <- sign((w.svm5 %*% svm5$SV[3,] + b.svm5) * y[svm5$index[3]])
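
A note on how the coefficients are recovered from e1071: `$coefs` holds \(\alpha_i y_i\) for the support vectors and `$rho` is the negative intercept, so
\[
\mathbf{w} = \sum_{i \in \mathrm{SV}} \alpha_i y_i\, \mathbf{x}_i, \qquad b = -\rho, \qquad \text{margin} = \frac{1}{\lVert \mathbf{w} \rVert}.
\]
The sanity check multiplies the decision value of one support vector by its true label; a value of \(-1\) means the factor levels were flipped internally, in which case \((\mathbf{w}, b)\) must be negated. This is what the `sgn` factors are used for later.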

Generating test dataset

# generate test data
x.out <- matrix(nrow = 2, ncol = 10000)
# x_1 and x_2 are Normal(0, 1), so the data do not need scaling
x.out[1, ] <- rnorm(10000, mean=0, sd=1)
x.out[2, ] <- rnorm(10000, mean=0, sd=1)

## generate y.out
y.out <- as.vector(ifelse(t(w) %*% x.out + b > 0, 1, -1))

data.out <- data.frame(x1=x.out[1, ], x2=x.out[2, ], y=y.out)

Visualizing the margins and lines on test data

ggplot(data = data.out, aes(x1,x2)) + geom_point(aes(color = factor(y), shape = factor(y))) + 
  geom_ribbon(aes(ymin=-w.svm3["x1"] / w.svm3["x2"]*x1-b.svm3 / w.svm3["x2"]-as.numeric(margin3),ymax=-w.svm3["x1"] / w.svm3["x2"]*x1-b.svm3 / w.svm3["x2"]+as.numeric(margin3)),fill="yellow", alpha = 0.3) +
  geom_abline(intercept = -b.svm3 / w.svm3["x2"], slope = -w.svm3["x1"] / w.svm3["x2"], color='yellow') +
  geom_abline(intercept = -w.pla[3] / w.pla[2], slope = -w.pla[1] / w.pla[2]) + 
  geom_abline(intercept = -b.svm5 / w.svm5["x2"], slope = -w.svm5["x1"] / w.svm5["x2"], color='blue') +
  geom_abline(intercept = -b.svm4 / w.svm4["x2"], slope = -w.svm4["x1"] / w.svm4["x2"], color = 'green') + 
  geom_abline(intercept = -b.svm / w.svm["x2"], slope = -w.svm["x1"] / w.svm["x2"], color = 'red') +
  geom_abline(intercept = -b.svm2 / w.svm2["x2"], slope = -w.svm2["x1"] / w.svm2["x2"], color = 'orange') +
  geom_ribbon(aes(ymin=-w.svm["x1"] / w.svm["x2"]*x1-b.svm / w.svm["x2"]-as.numeric(margin),ymax=-w.svm["x1"] / w.svm["x2"]*x1-b.svm / w.svm["x2"]+as.numeric(margin)),fill="red", alpha = 0.5) + 
  geom_ribbon(aes(ymin=-w.svm2["x1"] / w.svm2["x2"]*x1-b.svm2 / w.svm2["x2"]-as.numeric(margin2),ymax=-w.svm2["x1"] / w.svm2["x2"]*x1-b.svm2 / w.svm2["x2"]+as.numeric(margin2)),fill="orange", alpha = 0.4) + 
  geom_ribbon(aes(ymin=-w.svm4["x1"] / w.svm4["x2"]*x1-b.svm4 / w.svm4["x2"]-as.numeric(margin4),ymax=-w.svm4["x1"] / w.svm4["x2"]*x1-b.svm4 / w.svm4["x2"]+as.numeric(margin4)),fill="green", alpha = 0.15) +
  geom_ribbon(aes(ymin=-w.svm5["x1"] / w.svm5["x2"]*x1-b.svm5 / w.svm5["x2"]-as.numeric(margin5),ymax=-w.svm5["x1"] / w.svm5["x2"]*x1-b.svm5 / w.svm5["x2"]+as.numeric(margin5)),fill="blue", alpha = 0.15) 

Here, the perceptron line is in black. The soft-margin SVMs with cost = 1e+10, 10, 1, 0.1, and 0.03 are drawn in red, orange, yellow, green, and blue respectively, and the ribbons show the width of the corresponding margins. The costs were chosen to produce visible changes in the margin width. The blue margin, corresponding to the lowest cost, is the widest, while the red margin, corresponding to the highest cost, is the narrowest. The margins and classification errors are given below:

# misclassification
y.test=as.numeric(sgn)*sign(colSums(w.svm*x.out) + b.svm)
y.test2=as.numeric(sgn2)*sign(colSums(w.svm2*x.out) + b.svm2)
y.test3=as.numeric(sgn3)*sign(colSums(w.svm3*x.out) + b.svm3)
y.test4=as.numeric(sgn4)*sign(colSums(w.svm4*x.out) + b.svm4)
y.test5=as.numeric(sgn5)*sign(colSums(w.svm5*x.out) + b.svm5)

class.error = sum(y.out != y.test)/10000
class.error2 = sum(y.out != y.test2)/10000
class.error3 = sum(y.out != y.test3)/10000
class.error4 = sum(y.out != y.test4)/10000
class.error5 = sum(y.out != y.test5)/10000
c                     1e+10      10         1          0.1        0.03
margin                0.1787767  0.2305494  0.5556216  1.1594112  2.3335075
classification error  0.0477     0.0452     0.104      0.089      0.3153
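
The numbers above can be collected into one table; a minimal sketch (the object name `results1` is only illustrative):

results1 <- data.frame(
  cost        = c(1e+10, 10, 1, 0.1, 0.03),
  margin      = c(margin, margin2, margin3, margin4, margin5),
  class.error = c(class.error, class.error2, class.error3, class.error4, class.error5)
)
results1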

Running for 100 datasets

Pre-allocating result matrices

RUN <- 100
# one row per simulated dataset: (w1, w2, intercept)
pla.D  <- matrix(0, nrow=RUN, ncol=3)
svm.D  <- matrix(0, nrow=RUN, ncol=3)
svm2.D <- matrix(0, nrow=RUN, ncol=3)
svm3.D <- matrix(0, nrow=RUN, ncol=3)
svm4.D <- matrix(0, nrow=RUN, ncol=3)
svm5.D <- matrix(0, nrow=RUN, ncol=3)

Faster PLA code

#faster PLA
### Perceptron Learning Algorithm
pla <- function(x, y) {
  
  x <- rbind(x, 1)
  
  # initialize w.hat
  w.hat <- c(0, 0, 0)
  
  # start loop
  while(TRUE) {
    y.hat <- ifelse(t(w.hat) %*% x > 0, 1, -1)
    errors <- (y.hat != y)
    
    if(sum(errors) == 0) {
      # no more classification error, algorithm stops
      break
    } else {
      # choose the first error point to update w (deterministic, faster)
      point <- which(errors)[1]
      ## alternatively, choose an error point at random, as in the verbose version
      # point <- which(errors)[sample.int(sum(errors), 1)]
    
      # update w
      w.hat <- w.hat + y[point] * x[, point]
    }
  }
  return(w.hat)
}

Main loop over the 100 datasets

for (run in 1:RUN) {
  x[1, ] <- rnorm(N, mean=0, sd=1)
  x[2, ] <- rnorm(N, mean=0, sd=1)
  
  ## generate Y
  y <- as.vector(ifelse(t(w) %*% x + b > 0, 1, -1)) 
  
  #PLA
  pla.D[run, ] <- pla(x,y)
  
  #SVM
  data <- data.frame(x1=x[1, ], x2=x[2, ], y=y)
  
  svm1 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=1e+10, scale=FALSE)
  svm2 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=10, scale=FALSE)
  svm3 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=1, scale=FALSE)
  svm4 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=0.1, scale=FALSE)
  svm5 <- svm(factor(y) ~ x1 + x2, data, type="C-classification", kernel="linear", cost=0.03, scale=FALSE)
  
  # getting coefficients
  # 1
  w.svm1 <- colSums(as.vector(svm1$coefs) * svm1$SV)
  b.svm1 <- - svm1$rho
  margin1 <- 1 / sqrt(t(w.svm1) %*% w.svm1)
  # sanity check: should be 1, otherwise it is label switching
  sgn1 <- sign((w.svm1 %*% svm1$SV[1,] + b.svm1) * y[svm1$index[1]])
  svm.D[run, 1:2] <- as.numeric(sgn1)*w.svm1
  svm.D[run, 3] <- as.numeric(sgn1)*b.svm1
  
  # 2
  w.svm2 <- colSums(as.vector(svm2$coefs) * svm2$SV)
  b.svm2 <- - svm2$rho
  margin2 <- 1 / sqrt(t(w.svm2) %*% w.svm2)
  # sanity check: should be 1, otherwise it is label switching
  sgn2 <- sign((w.svm2 %*% svm2$SV[1,] + b.svm2) * y[svm2$index[1]])
  svm2.D[run, 1:2] <- as.numeric(sgn2)*w.svm2
  svm2.D[run, 3] <- as.numeric(sgn2)*b.svm2
  
  # 3
  w.svm3 <- colSums(as.vector(svm3$coefs) * svm3$SV)
  b.svm3 <- - svm3$rho
  margin3 <- 1 / sqrt(t(w.svm3) %*% w.svm3)
  # sanity check: should be 1, otherwise it is label switching
  sgn3 <- sign((w.svm3 %*% svm3$SV[1,] + b.svm3) * y[svm3$index[1]])
  svm3.D[run, 1:2] <- as.numeric(sgn3)*w.svm3
  svm3.D[run, 3] <- as.numeric(sgn3)*b.svm3
  
  # 4
  w.svm4 <- colSums(as.vector(svm4$coefs) * svm4$SV)
  b.svm4 <- - svm4$rho
  margin4 <- 1 / sqrt(t(w.svm4) %*% w.svm4)
  # sanity check: should be 1, otherwise it is label switching
  sgn4 <- sign((w.svm4 %*% svm4$SV[1,] + b.svm4) * y[svm4$index[1]])
  svm4.D[run, 1:2] <- as.numeric(sgn4)*w.svm4
  svm4.D[run, 3] <- as.numeric(sgn4)*b.svm4
  
  
  # 5
  w.svm5 <- colSums(as.vector(svm5$coefs) * svm5$SV)
  b.svm5 <- - svm5$rho
  margin5 <- 1 / sqrt(t(w.svm5) %*% w.svm5)
  # sanity check: should be 1, otherwise it is label switching
  sgn5 <- sign((w.svm5 %*% svm5$SV[1,] + b.svm5) * y[svm5$index[1]])
  svm5.D[run, 1:2] <- as.numeric(sgn5)*w.svm5
  svm5.D[run, 3] <- as.numeric(sgn5)*b.svm5
  
}

Getting the classification errors

class.error.D <- matrix(rep(0,RUN*6), nrow=RUN, ncol=6)

for (run in 1:RUN) {
  y.test.pla.D = sign(colSums(pla.D[run,1:2]*x.out) + pla.D[run,3])
  y.test.D = sign(colSums(svm.D[run,1:2]*x.out) + svm.D[run,3])
  y2.test.D = sign(colSums(svm2.D[run,1:2]*x.out) + svm2.D[run,3])
  y3.test.D = sign(colSums(svm3.D[run,1:2]*x.out) + svm3.D[run,3])
  y4.test.D = sign(colSums(svm4.D[run,1:2]*x.out) + svm4.D[run,3])
  y5.test.D = sign(colSums(svm5.D[run,1:2]*x.out) + svm5.D[run,3])
  
  class.error.D[run,1] <- sum(y.out != y.test.pla.D)/10000
  class.error.D[run,2] <- sum(y.out != y.test.D)/10000
  class.error.D[run,3] <- sum(y.out != y2.test.D)/10000
  class.error.D[run,4] <- sum(y.out != y3.test.D)/10000
  class.error.D[run,5] <- sum(y.out != y4.test.D)/10000
  class.error.D[run,6] <- sum(y.out != y5.test.D)/10000
}
# average error of each algorithm
ave.class.error <- colMeans(class.error.D)
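
The columns of `class.error.D` follow the order used in the loop above (PLA first, then the SVMs by decreasing cost), so the averages can be labelled accordingly; a minimal sketch:

names(ave.class.error) <- c("PLA", "c=1e+10", "c=10", "c=1", "c=0.1", "c=0.03")
ave.class.error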

Here we give the mean classification errors for the five SVMs and the PLA. The out-of-sample performance improves with regularization only for the large cost (c = 1e+10). For smaller costs, the PLA gives better out-of-sample performance, probably because the margin is too wide and the slack variables \(\xi_i\) become large (> 1).

c                          1e+10      10         1          0.1        0.03       PLA
margin                     0.1787767  0.2839417  0.4947379  1.5458022  5.1514436  NA
Mean classification error  0.053547   0.151203   0.123547   0.215573   0.419972   0.065875
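
For reference, the soft-margin SVM being fit solves
\[
\min_{\mathbf{w},\, b,\, \boldsymbol{\xi}} \;\; \tfrac{1}{2}\lVert \mathbf{w} \rVert^2 + C \sum_{i=1}^{N} \xi_i
\quad \text{subject to} \quad y_i(\mathbf{w}^\top \mathbf{x}_i + b) \ge 1 - \xi_i, \;\; \xi_i \ge 0,
\]
so a small cost \(C\) tolerates large slacks, and any point with \(\xi_i > 1\) sits on the wrong side of the boundary. This is why the widest margins in the table come with the worst classification error.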

Recovering boundary

pla.coeff <- data.frame(a = -pla.D[, 3] / pla.D[, 2], b = -pla.D[, 1] / pla.D[, 2])
svm.coeff <- data.frame(a = -svm.D[, 3] / svm.D[, 2], b = -svm.D[, 1] / svm.D[, 2])
svm2.coeff <- data.frame(a = -svm2.D[, 3] / svm2.D[, 2], b = -svm2.D[, 1] / svm2.D[, 2])
svm3.coeff <- data.frame(a = -svm3.D[, 3] / svm3.D[, 2], b = -svm3.D[, 1] / svm3.D[, 2])
svm4.coeff <- data.frame(a = -svm4.D[, 3] / svm4.D[, 2], b = -svm4.D[, 1] / svm4.D[, 2])
svm5.coeff <- data.frame(a = -svm5.D[, 3] / svm5.D[, 2], b = -svm5.D[, 1] / svm5.D[, 2])
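
In each case the fitted decision boundary \(w_1 x_1 + w_2 x_2 + w_0 = 0\) is rewritten as a line in the \((x_1, x_2)\) plane,
\[
x_2 = -\frac{w_0}{w_2} - \frac{w_1}{w_2}\, x_1,
\]
so each data frame stores the intercept `a` and slope `b` of one fitted boundary per row.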

Calculating bias(x) and var(x)

# pla: each row of gp_D_x is one dataset's fitted boundary evaluated at the test x1 values
gp_D_x <- as.matrix(pla.coeff) %*% rbind(1, x.out[1, ])
gp_bar_x <- colMeans(gp_D_x)      # average hypothesis at each x1
gp_sd_x <- apply(gp_D_x, 2, sd)   # spread of the hypotheses at each x1
# svm
g1_D_x <- as.matrix(svm.coeff) %*% rbind(1,x.out[1, ])
g1_bar_x <- colMeans(g1_D_x)
g1_sd_x <- apply(g1_D_x, 2, sd)

g2_D_x <- as.matrix(svm2.coeff) %*% rbind(1,x.out[1, ])
g2_bar_x <- colMeans(g2_D_x)
g2_sd_x <- apply(g2_D_x, 2, sd)

g3_D_x <- as.matrix(svm3.coeff) %*% rbind(1,x.out[1, ])
g3_bar_x <- colMeans(g3_D_x)
g3_sd_x <- apply(g3_D_x, 2, sd)

g4_D_x <- as.matrix(svm4.coeff) %*% rbind(1,x.out[1, ])
g4_bar_x <- colMeans(g4_D_x)
g4_sd_x <- apply(g4_D_x, 2, sd)

g5_D_x <- as.matrix(svm5.coeff) %*% rbind(1,x.out[1, ])
g5_bar_x <- colMeans(g5_D_x)
g5_sd_x <- apply(g5_D_x, 2, sd)

Visualization

Here are 5 sequential plots: first the PLA and the SVM with c = 1e+10 (red), then adding c = 10 (orange), 1 (yellow), 0.1 (green), and 0.03 (blue) in turn. Decreasing the cost from 1e+10 to 1 does not change bias(x) and variance(x) much. When c drops below 1, variance(x) and bias(x) start to deviate more, and this is probably the reason for the poor performance.

ggplot() + geom_line(aes(x=x.out[1, ], y=gp_bar_x), color="black", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=gp_bar_x - gp_sd_x,ymax=gp_bar_x + gp_sd_x), alpha=0.3, fill='black') +
  geom_line(aes(x=x.out[1, ], y=g1_bar_x), color="red", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g1_bar_x - g1_sd_x,ymax=g1_bar_x + g1_sd_x), alpha=0.5, fill='red') +
  geom_point(aes(x=x.out[1, ], y=y.out), color="darkblue", size=1) + labs(x="x", y="y") + 
  theme_bw()

ggplot() + geom_line(aes(x=x.out[1, ], y=gp_bar_x), color="black", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=gp_bar_x - gp_sd_x,ymax=gp_bar_x + gp_sd_x), alpha=0.3, fill='black') +
  geom_line(aes(x=x.out[1, ], y=g1_bar_x), color="red", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g1_bar_x - g1_sd_x,ymax=g1_bar_x + g1_sd_x), alpha=0.5, fill='red') +
  geom_line(aes(x=x.out[1, ], y=g2_bar_x), color="orange", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g2_bar_x - g2_sd_x,ymax=g2_bar_x + g2_sd_x), alpha=0.4, fill='orange') +
  geom_point(aes(x=x.out[1, ], y=y.out), color="darkblue", size=1) + labs(x="x", y="y") + 
  theme_bw()

ggplot() + geom_line(aes(x=x.out[1, ], y=gp_bar_x), color="black", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=gp_bar_x - gp_sd_x,ymax=gp_bar_x + gp_sd_x), alpha=0.3, fill='black') +
  geom_line(aes(x=x.out[1, ], y=g1_bar_x), color="red", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g1_bar_x - g1_sd_x,ymax=g1_bar_x + g1_sd_x), alpha=0.5, fill='red') +
  geom_line(aes(x=x.out[1, ], y=g2_bar_x), color="orange", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g2_bar_x - g2_sd_x,ymax=g2_bar_x + g2_sd_x), alpha=0.4, fill='orange') +
  geom_line(aes(x=x.out[1, ], y=g3_bar_x), color="yellow", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g3_bar_x - g3_sd_x,ymax=g3_bar_x + g3_sd_x), alpha=0.3, fill='yellow') +
  geom_point(aes(x=x.out[1, ], y=y.out), color="darkblue", size=1) + labs(x="x", y="y") + 
  theme_bw()

ggplot() + geom_line(aes(x=x.out[1, ], y=gp_bar_x), color="black", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=gp_bar_x - gp_sd_x,ymax=gp_bar_x + gp_sd_x), alpha=0.3, fill='black') +
  geom_line(aes(x=x.out[1, ], y=g1_bar_x), color="red", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g1_bar_x - g1_sd_x,ymax=g1_bar_x + g1_sd_x), alpha=0.5, fill='red') +
  geom_line(aes(x=x.out[1, ], y=g2_bar_x), color="orange", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g2_bar_x - g2_sd_x,ymax=g2_bar_x + g2_sd_x), alpha=0.4, fill='orange') +
  geom_line(aes(x=x.out[1, ], y=g3_bar_x), color="yellow", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g3_bar_x - g3_sd_x,ymax=g3_bar_x + g3_sd_x), alpha=0.3, fill='yellow') +
  geom_line(aes(x=x.out[1, ], y=g4_bar_x), color="green", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g4_bar_x - g4_sd_x,ymax=g4_bar_x + g4_sd_x), alpha=0.2, fill='green') +
  geom_point(aes(x=x.out[1, ], y=y.out), color="darkblue", size=1) + labs(x="x", y="y") + 
  theme_bw()

ggplot() + geom_line(aes(x=x.out[1, ], y=gp_bar_x), color="black", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=gp_bar_x - gp_sd_x,ymax=gp_bar_x + gp_sd_x), alpha=0.3, fill='black') +
  geom_line(aes(x=x.out[1, ], y=g1_bar_x), color="red", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g1_bar_x - g1_sd_x,ymax=g1_bar_x + g1_sd_x), alpha=0.5, fill='red') +
  geom_line(aes(x=x.out[1, ], y=g2_bar_x), color="orange", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g2_bar_x - g2_sd_x,ymax=g2_bar_x + g2_sd_x), alpha=0.4, fill='orange') +
  geom_line(aes(x=x.out[1, ], y=g3_bar_x), color="yellow", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g3_bar_x - g3_sd_x,ymax=g3_bar_x + g3_sd_x), alpha=0.3, fill='yellow') +
  geom_line(aes(x=x.out[1, ], y=g4_bar_x), color="green", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g4_bar_x - g4_sd_x,ymax=g4_bar_x + g4_sd_x), alpha=0.2, fill='green') +
  geom_line(aes(x=x.out[1, ], y=g5_bar_x), color="blue", size=1) + 
  geom_ribbon(aes(x=x.out[1, ], ymin=g5_bar_x - g5_sd_x,ymax=g5_bar_x + g5_sd_x), alpha=0.1, fill='blue') +
  geom_point(aes(x=x.out[1, ], y=y.out), color="darkblue", size=1) + labs(x="x", y="y") + 
  theme_bw()

Bias and Variance

# reference values: the true linear function w'x + b at the test points
y.out.value <- as.vector(t(w) %*% x.out + b)

bias.p <- mean((gp_bar_x - y.out.value)^2)
variance.p <- mean((t(gp_D_x) - gp_bar_x)^2)
mse.p <- mean((t(gp_D_x) - y.out.value)^2)

bias.1 <- mean((g1_bar_x - y.out.value)^2)
variance.1 <- mean((t(g1_D_x) - g1_bar_x)^2)
mse.1 <- mean((t(g1_D_x) - y.out.value)^2)

bias.2 <- mean((g2_bar_x - y.out.value)^2)
variance.2 <- mean((t(g2_D_x) - g2_bar_x)^2)
mse.2 <- mean((t(g2_D_x) - y.out.value)^2)

bias.3 <- mean((g3_bar_x - y.out.value)^2)
variance.3 <- mean((t(g3_D_x) - g3_bar_x)^2)
mse.3 <- mean((t(g3_D_x) - y.out.value)^2)

bias.4 <- mean((g4_bar_x - y.out.value)^2)
variance.4 <- mean((t(g4_D_x) - g4_bar_x)^2)
mse.4 <- mean((t(g4_D_x) - y.out.value)^2)

bias.5 <- mean((g5_bar_x - y.out.value)^2)
variance.5 <- mean((t(g5_D_x) - g5_bar_x)^2)
mse.5 <- mean((t(g5_D_x) - y.out.value)^2)
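
The quantities computed above follow the usual decomposition: writing \(\bar{g}(x)\) for the average over the 100 datasets of the fitted values \(g^{(\mathcal{D})}(x)\) and \(y(x)\) for the reference value `y.out.value`,
\[
\text{bias} = \mathbb{E}_x\!\bigl[(\bar{g}(x) - y(x))^2\bigr], \qquad
\text{var} = \mathbb{E}_x \mathbb{E}_{\mathcal{D}}\!\bigl[(g^{(\mathcal{D})}(x) - \bar{g}(x))^2\bigr], \qquad
\text{MSE} = \text{bias} + \text{var},
\]
and the identity MSE = bias + variance can be checked directly against the table below.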

c         1e+10       10          1           0.1         0.03        PLA
Bias      18.1544184  18.1261732  18.1167264  18.1133552  18.7723243  18.1705976
Variance  0.0585361   0.059356    0.0879806   0.5107483   7.7136987   0.0806699
MSE       18.2129544  18.1855292  18.2047069  18.6241035  26.486023   18.2512674

We see that regularizing, from the PLA to the SVM with a high cost, decreases the variance, as it should. Once the cost is too low, however, the SVM performs poorly. Hence, the choice of c is important.