{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "de728f18-a6fd-4e42-9592-e4fd91f2dc20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import Packages\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from xgboost import XGBRegressor\n",
    "from joblib import Parallel, delayed\n",
    "from sklearn.model_selection import train_test_split\n",
    "from scipy.stats import norm\n",
    "from rpy2.robjects.packages import importr\n",
    "from rpy2.robjects import numpy2ri\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "# Switch on rpy2's NumPy <-> R conversion for the R calls made later in the notebook.\n",
    "numpy2ri.activate()\n",
    "\n",
    "# Load the R package 'BalancedSampling' (must be installed in the R library rpy2 uses)\n",
    "# and keep a handle to its `cube` function, which single_trial calls.\n",
    "Balancesampling = importr('BalancedSampling')\n",
    "cube = Balancesampling.cube"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7bee6e7-cfd9-4c76-b32e-99b8e06b86ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Definition of Variance Estimator\n",
    "def Var_bern(Y, Yhat, xi, pi):\n",
    "    selected = np.where(xi == 1)[0]\n",
    "    residuals_sq = (Y[selected] - Yhat[selected]) ** 2\n",
    "    terms = residuals_sq * (1 - pi[selected]) / (pi[selected] ** 2)\n",
    "    variance_estimate = terms.sum() / (len(Y) ** 2)\n",
    "    return variance_estimate\n",
    "\n",
    "def Var_cube(Y, X, xi, pi, N, max_iter=100, tol=1e-6):\n",
    "    selected = np.where(xi == 1)[0]\n",
    "    Y_sample = Y[selected]\n",
    "    X_sample = X[selected, :]\n",
    "    pi_sample = pi[selected]\n",
    "\n",
    "    n, p = len(Y_sample), X_sample.shape[1]\n",
    "    c = (n / (n - p)) * (1 - pi_sample)\n",
    "\n",
    "    for _ in range(max_iter):\n",
    "        M = sum((c[i] / pi_sample[i] ** 2) * np.outer(X_sample[i], X_sample[i]) for i in range(n))\n",
    "        c_new = np.zeros(n)\n",
    "        M_inv = np.linalg.inv(M)\n",
    "\n",
    "        for k in range(n):\n",
    "            x_k = X_sample[k]\n",
    "            A = (x_k @ M_inv @ x_k) / pi_sample[k]\n",
    "            disc = 1 + 4 * A * (1 - pi_sample[k])\n",
    "            if disc < 0 or A == 0:\n",
    "                c_new[k] = 1 - pi_sample[k]\n",
    "            else:\n",
    "                c_new[k] = (-1 + np.sqrt(disc)) / (2 * A)\n",
    "\n",
    "        if np.max(np.abs(c_new - c)) < tol:\n",
    "            break\n",
    "        c = c_new\n",
    "\n",
    "    weights = c / (pi_sample ** 2)\n",
    "    XtW = (X_sample.T * weights)\n",
    "    XtWX = XtW @ X_sample\n",
    "    XtWY = XtW @ Y_sample\n",
    "    b = np.linalg.solve(XtWX, XtWY)\n",
    "    residuals = Y_sample - X_sample @ b\n",
    "\n",
    "    variance_estimate = (c * residuals ** 2 / pi_sample ** 2).sum() / (N ** 2)\n",
    "    return variance_estimate"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2f05b73-1d55-41d3-bfcb-60594bb801d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import & Split Dataset\n",
    "np.random.seed(0)\n",
    "# Rows with any missing value are dropped before features and target are extracted.\n",
    "data = pd.read_csv('bike.csv').dropna()\n",
    "\n",
    "# NOTE(review): if bike.csv is the UCI Bike Sharing data, cnt = casual + registered, so the\n",
    "# last two features reveal the target exactly - confirm this is intended for the experiment.\n",
    "selected_features = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', \n",
    "                    'weathersit', 'temp', 'atemp', 'hum', 'windspeed', \n",
    "                    'casual', 'registered']\n",
    "\n",
    "X_all = data[selected_features].values\n",
    "y_all = data.cnt.values\n",
    "\n",
    "# Half of the rows train the models; the other half is the population being sampled.\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.5, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3929a17-eb84-49e1-81de-049fc92485c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train Models & Initial Parameters\n",
    "# Point predictor for the target, fitted on the training half only.\n",
    "model = XGBRegressor(\n",
    "    n_estimators=1000,\n",
    "    learning_rate=0.001,\n",
    "    max_depth=7,\n",
    "    random_state=0,\n",
    ")\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Predictions on both halves; the signed training error is what the error model is fitted to.\n",
    "Yhat_tr = model.predict(X_train)\n",
    "Yhat = model.predict(X_test)\n",
    "err_tr = Yhat_tr - y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff1d82f5-c7b0-4ee8-8d1f-75fd206ecf87",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train error model\n",
    "# Second regressor with the same hyperparameters, fitted to the signed training error.\n",
    "model_err = XGBRegressor(\n",
    "    n_estimators=1000,\n",
    "    learning_rate=0.001,\n",
    "    max_depth=7,\n",
    "    random_state=0,\n",
    ")\n",
    "model_err.fit(X_train, err_tr)\n",
    "\n",
    "# Predicted error on the test half; its magnitude is the per-unit uncertainty score.\n",
    "ghat = model_err.predict(X_test)\n",
    "uncertainty = np.absolute(ghat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90a39e50-0c43-49c0-8141-f965bcfaf1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate statistics\n",
    "N = len(y_test)                  # number of units in the test half\n",
    "theta_true = y_test.mean()       # target value the estimators are scored against\n",
    "tau = 0.5                        # weight of the uncertainty term in run_simulation's pi\n",
    "qn = norm.ppf(0.95)              # normal quantile: estimate +/- qn * std is a two-sided 90% interval"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "130c8598-e155-4678-a55c-ad003cb85a38",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Experiment\n",
    "def _interval_summary(estimate, std, qn, theta_true):\n",
    "    \"\"\"Return (squared error, interval width, coverage flag, lower, upper) for one estimate.\"\"\"\n",
    "    half_width = qn * std\n",
    "    lower, upper = estimate - half_width, estimate + half_width\n",
    "    squared_error = (estimate - theta_true) ** 2\n",
    "    covered = int(lower < theta_true < upper)\n",
    "    return squared_error, 2 * half_width, covered, lower, upper\n",
    "\n",
    "# Single Trial\n",
    "def single_trial(seed, pi, pe, budget, qn, theta_true, y_test, Yhat, ghat, uncertainty, N):\n",
    "    \"\"\"Draw one sample per sampling scheme and score each resulting estimate.\n",
    "\n",
    "    Schemes, in output order: 'uniform' (Bernoulli draws with pe), 'poisson_active'\n",
    "    (Bernoulli draws with pi), 'cube_active' (R `cube` with probabilities pi and ghat as\n",
    "    the auxiliary variable) and 'classical' (plain mean of the sampled labels).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    seed : int\n",
    "        Seeds NumPy's global RNG and R's RNG for this trial.\n",
    "    pi, pe : arrays of length N\n",
    "        Active and uniform inclusion probabilities.\n",
    "    budget : float\n",
    "        Sampling fraction, used in the classical finite-population correction.\n",
    "    qn : float\n",
    "        Quantile multiplying each standard error to get the interval half-width.\n",
    "    theta_true : float\n",
    "        Value the estimates are compared with.\n",
    "    y_test, Yhat, ghat : arrays of length N\n",
    "        Labels, predictions and predicted errors.\n",
    "    uncertainty : array\n",
    "        Not used here; kept so existing callers keep working.\n",
    "    N : int\n",
    "        Number of units.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Tuple of five lists (squared error, interval width, coverage 0/1, lower bound,\n",
    "    upper bound), each holding one entry per scheme.\n",
    "    \"\"\"\n",
    "    import rpy2.robjects as ro  # local import: only needed here, to seed R\n",
    "\n",
    "    np.random.seed(seed)\n",
    "    # The cube sampler runs in R, whose RNG np.random.seed does not control.\n",
    "    ro.r['set.seed'](int(seed))\n",
    "\n",
    "    residual = y_test - Yhat\n",
    "    summaries = []\n",
    "\n",
    "    # Poisson Uniform Sampling\n",
    "    xi_1 = np.random.binomial(1, pe)\n",
    "    est_1 = (Yhat + residual * xi_1 / pe).mean()\n",
    "    Std_1 = np.sqrt(Var_bern(y_test, Yhat, xi_1, pe))\n",
    "    summaries.append(_interval_summary(est_1, Std_1, qn, theta_true))\n",
    "\n",
    "    # Poisson Active Sampling\n",
    "    xi_2 = np.random.binomial(1, pi)\n",
    "    est_2 = (Yhat + residual * xi_2 / pi).mean()\n",
    "    Std_2 = np.sqrt(Var_bern(y_test, Yhat, xi_2, pi))\n",
    "    summaries.append(_interval_summary(est_2, Std_2, qn, theta_true))\n",
    "\n",
    "    # Cube Active Sampling with ghat as the auxiliary variable\n",
    "    un_r = numpy2ri.py2rpy(np.reshape(ghat, (-1, 1)))\n",
    "    pi_r = numpy2ri.py2rpy(pi)\n",
    "    cube_result = cube(un_r, prob=pi_r)\n",
    "    xi_3 = np.zeros(N)\n",
    "    # The returned indices are shifted by one to address NumPy's 0-based array.\n",
    "    xi_3[np.array(cube_result).astype(int) - 1] = 1\n",
    "    est_3 = (Yhat + residual * xi_3 / pi).mean()\n",
    "    Std_3 = np.sqrt(Var_cube(residual, ghat.reshape(-1, 1), xi_3, pi, N))\n",
    "    summaries.append(_interval_summary(est_3, Std_3, qn, theta_true))\n",
    "\n",
    "    # Classical Simple Random Sampling\n",
    "    xi_4 = np.random.binomial(1, pe)\n",
    "    selected_samples_4 = y_test[xi_4 == 1]\n",
    "    est_4 = selected_samples_4.mean()\n",
    "    m = xi_4.sum()\n",
    "    sample_var_4 = selected_samples_4.var(ddof=1)\n",
    "    # (1 - budget) is the finite-population correction for the sampling fraction.\n",
    "    Std_4 = np.sqrt((1 - budget) * sample_var_4 / m)\n",
    "    summaries.append(_interval_summary(est_4, Std_4, qn, theta_true))\n",
    "\n",
    "    # Transpose per-scheme tuples into per-metric lists, matching the original layout.\n",
    "    return tuple(list(metric) for metric in zip(*summaries))\n",
    "\n",
    "# Simulation N Trials\n",
    "def run_simulation(budget, N_trial, qn, theta_true, y_test, Yhat, ghat, uncertainty, tau, N, n_jobs=10):\n",
    "    \"\"\"Run N_trial independent trials at one budget and average the metrics.\n",
    "\n",
    "    The active inclusion probabilities mix a term proportional to `uncertainty` with a\n",
    "    uniform term, weighted by tau and (1 - tau), and are capped at 1. Trials use the\n",
    "    seeds 0 .. N_trial - 1 and run in parallel.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    budget : float\n",
    "        Sampling fraction.\n",
    "    N_trial : int\n",
    "        Number of trials.\n",
    "    n_jobs : int\n",
    "        Number of joblib workers (default 10, the value previously hard-coded).\n",
    "    Remaining arguments are forwarded to single_trial.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (RMSE, mean interval width, coverage rate, mean lower bound, mean upper bound,\n",
    "    budget, number of trials); the first five are arrays with one entry per scheme.\n",
    "    \"\"\"\n",
    "    pi = np.minimum(tau * uncertainty / uncertainty.mean() * budget + (1 - tau) * budget, 1)\n",
    "    pe = np.full(N, budget)\n",
    "\n",
    "    results = Parallel(n_jobs=n_jobs)(\n",
    "        delayed(single_trial)(seed, pi, pe, budget, qn, theta_true, y_test, Yhat, ghat, uncertainty, N)\n",
    "        for seed in range(N_trial))\n",
    "\n",
    "    # Average each of the five metrics over trials, one value per scheme.\n",
    "    mean_sq_err, mean_width, coverage, mean_lb, mean_hb = (\n",
    "        np.mean([trial[k] for trial in results], axis=0) for k in range(5))\n",
    "\n",
    "    return np.sqrt(mean_sq_err), mean_width, coverage, mean_lb, mean_hb, budget, len(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3debc944-5a38-4b5e-8f02-3e15569e711e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Main\n",
    "np.random.seed(0)\n",
    "N_trials = 10000\n",
    "# Budgets 0.03, 0.04, ..., 0.45. linspace + round is used because a float-step arange\n",
    "# can gain or lose its last element and yields values such as 0.060000000000000005.\n",
    "budgets = np.round(np.linspace(0.03, 0.45, 43), 2)\n",
    "results_list = [run_simulation(b, N_trials, qn, theta_true, y_test, Yhat, ghat, uncertainty, tau, N) for b in budgets]\n",
    "\n",
    "columns = ['uniform', 'poisson_active', 'cube_active', 'classical', 'budget', 'SuccessCount']\n",
    "# One table per metric returned by run_simulation, in order: RMSE, mean interval width,\n",
    "# coverage rate, mean lower bound, mean upper bound. x[5] is the budget and x[6] the\n",
    "# number of trials behind each row.\n",
    "Result1, Result2, Result3, Result4, Result5 = (\n",
    "    pd.DataFrame([x[k].tolist() + [x[5], x[6]] for x in results_list], columns=columns)\n",
    "    for k in range(5)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "88a80849-e0a4-4d8c-848a-b7bebd55d580",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write each metric table to its CSV file, then print the tables in the same order.\n",
    "_exports = (\n",
    "    (\"Bike_Result_g1.csv\", Result1),\n",
    "    (\"Bike_Result_g2.csv\", Result2),\n",
    "    (\"Bike_Result_g3.csv\", Result3),\n",
    "    (\"Bike_Result_g4.csv\", Result4),\n",
    "    (\"Bike_Result_g5.csv\", Result5),\n",
    ")\n",
    "for _csv_name, _frame in _exports:\n",
    "    _frame.to_csv(_csv_name, index=False)\n",
    "\n",
    "for _csv_name, _frame in _exports:\n",
    "    print(_frame)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f588b958-f1de-4e7e-9fcb-8ca878a3e8dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# bike\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Load data\n",
    "Result1 = pd.read_csv(\"Bike_Result_g1.csv\")\n",
    "Result2 = pd.read_csv(\"Bike_Result_g2.csv\")\n",
    "Result3 = pd.read_csv(\"Bike_Result_g3.csv\")\n",
    "\n",
    "# NOTE: y_test must still be defined by the data-splitting cell; it turns the budget\n",
    "# fraction into a sample size on the x-axis.\n",
    "N = len(y_test)\n",
    "font_size = 18\n",
    "\n",
    "methods = ['classical', 'uniform', 'poisson_active', 'cube_active']\n",
    "colors = {'uniform': 'red', 'poisson_active': 'green', 'cube_active': 'blue', 'classical': 'orange'}\n",
    "\n",
    "fig, axes = plt.subplots(1, 3, figsize=(15, 5))\n",
    "\n",
    "def plot_results(ax, data, ylabel, log_scale=False, ylim_zero=False):\n",
    "    \"\"\"Draw one line per method on `ax`, with budget * N on the x-axis.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    ax : matplotlib Axes to draw on.\n",
    "    data : DataFrame with a 'budget' column and one column per entry of `methods`.\n",
    "    ylabel : y-axis label.\n",
    "    log_scale : use a logarithmic y-axis when True.\n",
    "    ylim_zero : force the y-axis to start at 0 when True.\n",
    "    \"\"\"\n",
    "    for method in methods:\n",
    "        label = 'cube_active (ours)' if method == 'cube_active' else method\n",
    "        ax.plot(data['budget'] * N, data[method], label=label, color=colors[method], linewidth=2)\n",
    "\n",
    "    ax.set_xlabel(\"Sample Size\", fontsize=font_size)\n",
    "    ax.set_ylabel(ylabel, fontsize=font_size)\n",
    "    ax.tick_params(axis='both', labelsize=font_size)\n",
    "\n",
    "    if log_scale:\n",
    "        ax.set_yscale('log')\n",
    "    if ylim_zero:\n",
    "        ax.set_ylim(bottom=0)\n",
    "    ax.grid(True, linestyle='--', linewidth=1)\n",
    "\n",
    "# One panel per metric: RMSE (Result1), interval width (Result2), coverage rate (Result3).\n",
    "panels = [\n",
    "    (Result1, \"RMSE\", \"(a)\"),\n",
    "    (Result2, \"Interval Width\", \"(b)\"),\n",
    "    (Result3, \"Coverage Rate\", \"(c)\"),\n",
    "]\n",
    "for ax, (frame, ylabel, title) in zip(axes, panels):\n",
    "    plot_results(ax, frame, ylabel)\n",
    "    ax.set_title(title, fontsize=font_size)\n",
    "\n",
    "# Coverage is a rate, so pin its axis to [0, 1] with ticks at the levels of interest.\n",
    "axes[2].set_ylim(0, 1)\n",
    "axes[2].set_yticks([0, 0.5, 0.8, 0.9, 1])\n",
    "\n",
    "# A single shared legend below the panels; no per-axes legend is ever created.\n",
    "handles, labels = axes[0].get_legend_handles_labels()\n",
    "fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.5, -0.08),\n",
    "           ncol=4, frameon=False, fontsize=font_size)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"Bike_result.pdf\", \n",
    "             format='pdf',\n",
    "             bbox_inches='tight',\n",
    "             dpi=3000)\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
