Given \(X_i\sim \mathrm{Uniform}[-1,1]\) and \(\bar{g}(x)=\mathbb{E}_\mathcal{D}[Ax+B]\), the line through \((X_1,X_1^2)\) and \((X_2,X_2^2)\) has slope \(A=\frac{X_1^2-X_2^2}{X_1-X_2}=X_1+X_2\). Its equation is \((X_1+X_2)(x-X_1)+X_1^2=(X_1+X_2)x-X_1X_2\), hence \(B=-X_1X_2\). Since \(X_1\) and \(X_2\) are independent with \(\mathbb{E}_\mathcal{D}[X_i]=0\), we get
\[\begin{align*} \bar{g}(x)&=\mathbb{E}_\mathcal{D}[(X_1+X_2)x-X_1X_2]\\ &=(\mathbb{E}_\mathcal{D}[X_1]+\mathbb{E}_\mathcal{D}[X_2])x-\mathbb{E}_\mathcal{D}[X_1]\mathbb{E}_\mathcal{D}[X_2]\\ &=0 \end{align*}\]
#load libraries
library('ggplot2');
# Simulate N training sets, each made of two points (x1, x1^2), (x2, x2^2)
# with x1, x2 drawn uniformly from [-1, 1]. The seed fixes the draws.
N <- 1000
set.seed(12345)
g <- data.frame(
  x1 = runif(N, min = -1, max = 1),
  x2 = runif(N, min = -1, max = 1)
)
g <- transform(g, y1 = x1^2, y2 = x2^2)
# Line through the two points of each training set: slope a, intercept b
g <- transform(g, a = (y1 - y2) / (x1 - x2))
g <- transform(g, b = y1 - a * x1)
# Show the first rows, now including the slopes and intercepts
head(g)
## x1 x2 y1 y2 a b
## 1 0.44180779 -0.8444866 0.195194126 0.71315761 -0.4026788 0.37310076
## 2 0.75154639 0.6874273 0.564821971 0.47255629 1.4389737 -0.51663350
## 3 0.52196466 -0.9863726 0.272447103 0.97293097 -0.4644080 0.51485165
## 4 0.77224913 -0.2976316 0.596368722 0.08858456 0.4746175 0.22984574
## 5 -0.08703808 0.3721221 0.007575627 0.13847483 0.2850840 0.03238879
## 6 -0.66725643 -0.4918270 0.445231143 0.24189381 -1.1590834 -0.32817474
# Draw every fitted two-point line in gray, then the target x^2 on top
ggplot(g, aes(x = x1, y = y1)) +
  geom_abline(mapping = aes(slope = a, intercept = b), color = "gray") +
  stat_function(fun = function(x) x^2, size = 1) +
  labs(x = "x", y = "y")
# Average hypothesis gbar(x) = aBar * x + bBar. Averaging the slopes and
# intercepts separately is valid because expectation is linear:
# E_D[a x + b] = E_D[a] x + E_D[b]. Each mean is a single value that is
# repeated on every row of g.
g <- transform(g, aBar = mean(a), bBar = mean(b))
# Generate a test set from the population: 10000 draws of x ~ Uniform[-1, 1]
f <- data.frame(X = runif(10000, min = -1, max = 1))
# For each test point x, evaluate every fitted line a * x + b and record the
# standard deviation and the mean across training sets. vapply() pins each
# result to exactly one double, so the column type cannot silently change.
f$sd <- vapply(f$X, function(x) sd(g$a * x + g$b), numeric(1))
f$mean <- vapply(f$X, function(x) mean(x * g$a + g$b), numeric(1))
# One-standard-deviation band around the pointwise mean
f$up <- f$mean + f$sd
f$low <- f$mean - f$sd
# Plot gbar with a +-1 sd ribbon, together with the target x^2.
# The slope and intercept of gbar are passed as constants, so geom_abline()
# draws a single red line instead of mapping the same constant over every
# row of g.
ggplot(data = f, aes(X)) +
  xlab("x") + ylab("y") +
  geom_ribbon(aes(ymin = low, ymax = up), fill = "grey80") +
  geom_abline(slope = mean(g$a), intercept = mean(g$b),
              colour = "red", size = 1) +
  stat_function(fun = function(x) x^2, size = 1) +
  # parse = TRUE renders the labels as plotmath expressions
  annotate("text", x = 0, y = 0.2, label = "x ^ 2", parse = TRUE, size = 5) +
  annotate("text", x = 0, y = -0.2, label = "bar(g)(x)", colour = "red",
           parse = TRUE, size = 5)
# g$aBar and g$bBar hold the same value on every row of g. Take them as
# scalars so that arithmetic against the length-10000 test vector f$X does
# not depend on silent recycling of a length-N column (which only works
# when N divides 10000).
a_bar <- g$aBar[1]
b_bar <- g$bBar[1]
# E_out for each training set D: mean squared error of its line a * x + b
# against the target x^2 over the test set
g$Eout <- mapply(function(a, b) mean((a * f$X + b - (f$X)^2)^2), g$a, g$b)
# E_D[E_out]
mean(g$Eout)
## [1] 0.5371955
# bias: mean squared error of gbar against the target over the test set
mean((a_bar * f$X + b_bar - f$X^2)^2)
## [1] 0.2052436
# var: for each D, mean squared deviation of its line from gbar over the
# test set; then averaged over all D
g$varx <- mapply(
  function(a, b) mean((a * f$X + b - a_bar * f$X - b_bar)^2),
  g$a, g$b
)
mean(g$varx)
## [1] 0.331952
As expected, \(\mathbb{E}_\mathcal{D}[E_{out}] =\) bias \(+\) var: \(0.2052 + 0.3320 \approx 0.5372\).