set.seed(42)
library(randomForest)


# Load the collection of datasets and pick the one analysed in this script.
All_Data <- readRDS("All_Data.rds")
data <- All_Data$Airfoil    # select the dataset from All_Data
n <- nrow(data)
# Shuffle the rows so that the split below is a random one.
data <- data[sample(n), ]
main_title <- "Airfoil"
ntree <- 1000

# density plot for y
plot(density(data$y), main = main_title, xlab = "y", lwd = 2)

# training / test split
# Use an integer boundary: with the fractional bound n / 2 the index sequence
# (n / 2 + 1):n is fractional when n is odd, gets truncated on subsetting, and
# the last shuffled row ends up in neither set. Here the extra row of an
# odd-sized dataset goes to the training set, so every observation is used.
n_train <- ceiling(n / 2)
training <- data[seq_len(n_train), ]
test <- data[n_train + seq_len(n - n_train), ]

# Fit the forest. The response is taken from the first column and all other
# columns are used as predictors (NOTE(review): the rest of the script refers
# to the response as `y`; this assumes `y` is the first column -- confirm).
# keep.inbag = TRUE stores the per-tree in-bag matrix in fit$inbag.
fit <- randomForest(y = training[, 1], x = training[, -1], ntree = ntree, keep.inbag = TRUE)


# Absolute errors, each computed once and reused for the plot below.
test_abs_err <- abs(test$y - predict(fit, test))
# predict() without new data returns the out-of-bag predictions.
oob_abs_err <- abs(training$y - predict(fit))

# One colour per curve, shared by the lines and the legend so they cannot
# drift apart (the legend previously carried a stray `col =` inside c()).
err_cols <- c(rgb(0, 0, 0.5), rgb(1, 0, 0))

plot(density(test_abs_err), type = "n", main = main_title, ylim = c(0, 0.22),
     xlab = "Absolute Error")
lines(density(test_abs_err), col = err_cols[1], lwd = 3, lty = 2)
lines(density(oob_abs_err), col = err_cols[2], lwd = 3, lty = 4)
legend("topright", c("Prediction error", "OOB error"), col = err_cols, lty = c(2, 4), lwd = 3)


# compare RF(X) and RFi(X)
# RFi is the forest restricted to the trees that did NOT see training point i
# (its out-of-bag trees). The weights were previously built from the in-bag
# indicator, which averaged exactly the trees that DID contain point i.
tree_inbag <- fit$inbag > 0   # ntrain * ntree, TRUE where the point is in-bag
tree_oob <- !tree_inbag       # ntrain * ntree, TRUE where the point is out-of-bag

# record B: number of out-of-bag trees for each training point
B <- rowSums(tree_oob) # ntrain * 1
# A point that is in-bag for every tree has no RFi; fail loudly instead of
# propagating NaN weights through everything that follows.
stopifnot(all(B > 0))


# RFi
pred <- predict(fit, test, predict.all = TRUE)
tree_pred <- pred$individual   # ntest * ntree
# Row j of the weight matrix averages over the out-of-bag trees of point j.
renorm_tree_oob <- sweep(tree_oob, MARGIN = 1, STATS = B, FUN = "/")
# Entry [i, j] = sum over trees of tree_pred[i, ] * renorm_tree_oob[j, ],
# i.e. the jth RF oob predictor on the ith test point, computed for all
# pairs with a single matrix product instead of a double loop.
RF_oob_pred <- tcrossprod(tree_pred, renorm_tree_oob)   # ntest * ntrain
dimnames(RF_oob_pred) <- NULL   # plain matrix, as the loop-built version was
RF_pred <- pred$aggregate


# Empirical distributions of |RF_oob_pred[, j] - RF_pred|: one density curve
# per column of the difference matrix, drawn faintly on a shared canvas.
diff <- sweep(RF_oob_pred, MARGIN = 1, STATS = RF_pred, FUN = "-")   # ntest * ntrain
abs_diff <- abs(diff)
n_curves <- ncol(diff)

# Empty canvas with fixed axis limits; the curves are added afterwards.
plot(density(abs_diff[, 1]), type = "n", ylim = c(0, 11), xlim = c(-0.05, 1),
     main = main_title, xlab = "Absolute Difference")

# Semi-transparent colours whose blue channel grows with the column index.
curve_cols <- rgb(0.7, 0.7, 0.5 + (seq_len(n_curves) / n_curves) / 2.5, 0.1)
for (k in seq_len(n_curves)) {
  lines(density(abs_diff[, k]), col = curve_cols[k], lwd = 0.1)
}

# estimate epsilon_n,B for given 1 - nu_n,B
# For each column j of abs_diff, take the (1 - nu) empirical quantile of
# |RF(X) - RFj(X)| over the rows, so that at most a fraction nu of the rows
# exceed it; the reported eps is the largest of these per-column quantiles.
# The loop must run over the COLUMNS of abs_diff: it previously ran to
# nrow(diff) while indexing columns, which is only valid when the matrix
# happens to be square.
nu <- 0.05
epsilon <- vapply(
  seq_len(ncol(abs_diff)),
  function(j) quantile(ecdf(abs_diff[, j]), 1 - nu, names = FALSE),
  numeric(1)
)
(eps <- max(epsilon))


