# Fix the RNG seed so the simulated data and the fitted forest are reproducible.
set.seed(42)
# Provides randomForest() and the predict() method used on the fitted forest.
library(randomForest)


# Create dataset: simulation sizes.
n <- 4000       # total number of simulated observations
ntree <- 1000   # number of trees grown in the forest


# f1: linear trend in y plus a sine wave of amplitude 5 (vectorised over y).
f1 <- function(y) {
  trend <- 0.5 * y
  wave <- 5 * sin(y)
  trend + wave
}
# f2: quadratic minus a scaled cubic, y^2 - 0.2 * y^3 (vectorised over y).
f2 <- function(y) {
  square_term <- y^2
  cubic_term <- 0.2 * y^3
  square_term - cubic_term
}
# f3: indicator of y > 0 (1 where y is positive, 0 otherwise), vectorised.
f3 <- function(y) {
  is_positive <- y > 0
  ifelse(is_positive, 1, 0)
}

## Set the distribution of y: n draws from the standard Cauchy distribution.
y <- rcauchy(n)
# y <- abs(y)

# Predictors: x1 and x2 are deterministic transforms of y; x3 is the step
# function f3(y) plus standard-normal noise.
data_cauchy <- data.frame(
  y = y,
  x1 = f1(y),
  x2 = f2(y),
  x3 = rnorm(n) + f3(y)
)

data <- data_cauchy

# Train/test split: the first 75% of rows train the forest, the remaining 25%
# are held out for testing.
# floor() keeps the split point integral when n is not a multiple of 4; with a
# fractional bound, `1:(3*n/4)` and `(3*n/4+1):n` would silently drop the last
# row of data from both sets.
n_train <- floor(3 * n / 4)
training <- data[seq_len(n_train), ]
test <- data[n_train + seq_len(n - n_train), ]

# Fit the regression forest: the response is the first column (y), the
# predictors are all remaining columns. keep.inbag = TRUE stores the in-bag
# matrix (fit$inbag), which records which training rows each tree was grown on.
fit <- randomForest(
  x = training[, -1],
  y = training[, 1],
  ntree = ntree,
  keep.inbag = TRUE
)

# Compare the out-of-bag error |Yi - RFi(Xi)| on the training set with the
# held-out prediction error |Y - RF(X)| on the test set, on the log10 scale.

# Compute each density once and reuse it for both the frame and the curve.
# predict(fit) without newdata returns the out-of-bag predictions.
test_err_density <- density(log10(abs(test$y - predict(fit, test))))
oob_err_density <- density(log10(abs(training$y - predict(fit))))
err_cols <- c(rgb(0, 0, 0.5), rgb(1, 0, 0))

# Draw an empty frame first so both curves share the same fixed axes.
plot(test_err_density, type = "n", ylim = c(0, 0.6), xlim = c(-4, 4),
     main = "", xlab = "log10(Absolute Error)")
lines(test_err_density, col = err_cols[1], lwd = 3, lty = 2)
lines(oob_err_density, col = err_cols[2], lwd = 3, lty = 4)
legend("topright", c("Prediction error", "OOB error"), col = err_cols,
       lty = c(2, 4), lwd = 3)


# Compare RF(X) with the out-of-bag forests RFi(X), where RFi averages only the
# trees whose bootstrap sample did NOT contain training point i.
tree_inbag <- fit$inbag > 0   # ntrain * ntree, TRUE where the point is in-bag
# The OOB predictor must use the complement of the in-bag mask; averaging the
# in-bag trees instead would give the forest that *did* see point i.
tree_oob <- !tree_inbag       # ntrain * ntree, TRUE where the point is OOB

# record Bi: number of trees for which training point i is out-of-bag
B <- rowSums(tree_oob)        # length ntrain
if (any(B == 0)) {
  warning("some training points are in-bag for every tree; ",
          "their OOB forest is undefined (NaN)", call. = FALSE)
}

# RFi
pred <- predict(fit, test, predict.all = TRUE)
tree_pred <- pred$individual   # ntest * ntree
# sum(tree_pred[2,]*tree_oob[1,]/B[1])  # Example: RF\1(2)

# Row j of the weight matrix is 1/Bj on the trees that are OOB for training
# point j and 0 elsewhere (B is recycled down the rows).
renorm_tree_oob <- tree_oob / B
# RF_oob_pred[i, j] = sum_t tree_pred[i, t] * renorm_tree_oob[j, t], i.e. the
# jth OOB forest evaluated at the ith test point. One matrix product replaces
# the ntest * ntrain R-level double loop.
RF_oob_pred <- tcrossprod(tree_pred, renorm_tree_oob)   # ntest * ntrain
dimnames(RF_oob_pred) <- NULL
RF_pred <- pred$aggregate


# Empirical distributions of |RF_oob_pred[, j] - RF_pred| over the test points,
# one density curve per column j, drawn on the log10 scale.
diff <- sweep(RF_oob_pred, MARGIN = 1, STATS = RF_pred, FUN = "-")   # ntest * ntrain
log10_abs_diff <- log10(abs(diff))

# Blank canvas with fixed axes; the individual curves are overlaid below.
plot(
  density(log10_abs_diff[, 1]),
  type = "n", ylim = c(0, 0.8), xlim = c(-10, 5),
  main = "", xlab = "log10(Absolute Difference)"
)
for (j in seq_len(ncol(diff))) {
  # Nearly transparent thin lines; the blue channel ramps up to 0.9 with j.
  curve_col <- rgb(0.7, 0.7, 0.5 + (j / ncol(diff)) / 2.5, 0.1)
  lines(density(log10_abs_diff[, j]), col = curve_col, lwd = 0.1)
}

# Estimate epsilon_{n,B} for a given nu_{n,B}.
# For each training point i, epsilon[i] is the (1 - nu) empirical quantile,
# taken over the test points, of log10|RF(X) - RFi(X)|, so roughly a fraction
# nu of test points differ by more than 10^epsilon[i]. eps takes the largest
# of these so the bound holds for every i simultaneously.
nu <- 0.05
# Columns of log10_abs_diff index training points, so iterate over columns.
# Looping over nrow() while indexing columns would only cover as many training
# points as there are test rows.
epsilon <- vapply(
  seq_len(ncol(log10_abs_diff)),
  function(i) unname(quantile(ecdf(log10_abs_diff[, i]), 1 - nu)),
  numeric(1)
)
(eps <- 10^(max(epsilon)))

     