{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d955dea9-144d-4c4c-b717-4c675756c17a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ================================================\n",
    "# Full Conformal Bayes with AOI from Fong & Holmes (2021) paper on California Housing Dataset\n",
    "# CB using parametric Bayesian linear models, quadratic models, or a combination of them.\n",
    "# CB with quadratic models and the important features should have the best performance.\n",
    "# ================================================\n",
    "\n",
    "import time\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import random\n",
    "\n",
    "from scipy.special import logsumexp\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "\n",
    "\n",
    "\n",
    "def zscore(a, axis=0, eps=1e-12):\n",
    "    \"\"\"Standardise `a` along `axis`.\n",
    "\n",
    "    Returns a 3-tuple: the standardised array, the mean and the standard\n",
    "    deviation (both computed with keepdims=True). `eps` is added to the\n",
    "    standard deviation before dividing so constant slices do not divide by zero.\n",
    "    \"\"\"\n",
    "    center = np.mean(a, axis=axis, keepdims=True)\n",
    "    spread = np.std(a, axis=axis, keepdims=True)\n",
    "    standardized = (a - center) / (spread + eps)\n",
    "    return standardized, center, spread\n",
    "\n",
    "def normal_logpdf(y, mean, var):\n",
    "    \"\"\"Element-wise log-density of a Normal(mean, var) distribution at y (inputs broadcast).\"\"\"\n",
    "    log_norm = np.log(2 * np.pi * var)\n",
    "    scaled_sq_resid = (y - mean) ** 2 / var\n",
    "    return -0.5 * (log_norm + scaled_sq_resid)\n",
    "\n",
    "def invgamma_sample(rng, alpha, beta, size=None):\n",
    "    \"\"\"Draw inverse-gamma variates as reciprocals of Gamma(shape=alpha, scale=1/beta) draws from `rng`.\"\"\"\n",
    "    gamma_draws = rng.gamma(shape=alpha, scale=1.0 / beta, size=size)\n",
    "    return 1.0 / gamma_draws\n",
    "\n",
    "\n",
    "\n",
    "def load_california_standardized(random_state=0):\n",
    "    \"\"\"Fetch the California housing data and z-score features and target column-wise.\n",
    "\n",
    "    Returns (X_standardized, y_standardized, (X_mean, X_std, y_mean, y_std)).\n",
    "    NOTE: `random_state` is accepted but not used anywhere in this function.\n",
    "    \"\"\"\n",
    "    features, target = fetch_california_housing(return_X_y=True)\n",
    "    features_z, feat_mean, feat_std = zscore(features, axis=0)\n",
    "    target_z, targ_mean, targ_std = zscore(target, axis=0)\n",
    "    scaling = (feat_mean, feat_std, targ_mean, targ_std)\n",
    "    return features_z, target_z, scaling\n",
    "\n",
    "def sample_train_test(X, y, n_train, m_test, seed):\n",
    "    \"\"\"Draw disjoint train/test subsets of sizes n_train and m_test without replacement.\n",
    "\n",
    "    A fresh generator seeded with `seed` picks n_train + m_test distinct row\n",
    "    indices; the first n_train go to training and the rest to testing.\n",
    "    Returns (X_train, y_train, X_test, y_test).\n",
    "    \"\"\"\n",
    "    generator = np.random.default_rng(seed)\n",
    "    total_rows = X.shape[0]\n",
    "    picked = generator.choice(total_rows, size=n_train + m_test, replace=False)\n",
    "    train_rows, test_rows = picked[:n_train], picked[n_train:]\n",
    "    return X[train_rows], y[train_rows], X[test_rows], y[test_rows]\n",
    "\n",
    "\n",
    "\n",
    "def design_M1_full(X):\n",
    "    \"\"\"Design matrix for model M1: a leading column of ones followed by every column of X.\"\"\"\n",
    "    intercept = np.ones(X.shape[0])\n",
    "    return np.column_stack([intercept, X])\n",
    "\n",
    "def design_M2_small(X):\n",
    "    \"\"\"Design matrix for model M2: intercept plus the first two columns of X.\n",
    "\n",
    "    The first two columns are the features treated as important here (MedInc, HouseAge).\n",
    "    \"\"\"\n",
    "    selected = X[:, [0, 1]]\n",
    "    intercept = np.ones(X.shape[0])\n",
    "    return np.column_stack([intercept, selected])\n",
    "\n",
    "# Shared degree-2 polynomial expander (bias column included) used by design_M3_quadratic.\n",
    "_poly = PolynomialFeatures(include_bias=True, degree=2)\n",
    "def design_M3_quadratic(X):\n",
    "    \"\"\"Design matrix for model M3: degree-2 polynomial expansion of all columns of X.\n",
    "\n",
    "    The module-level transformer is re-fitted on every call via fit_transform.\n",
    "    \"\"\"\n",
    "    expanded = _poly.fit_transform(X)\n",
    "    return expanded\n",
    "\n",
    "\n",
    "def posterior_params_conjugate(X, y, m0, V0, a0, b0):\n",
    "    \"\"\"Closed-form conjugate posterior parameters for a Gaussian linear model.\n",
    "\n",
    "    Inputs are the design matrix X (n, p), responses y (n,), the prior mean m0,\n",
    "    prior scale matrix V0 and the prior shape/scale pair (a0, b0).\n",
    "    Returns (mn, Vn, an, bn): posterior mean, posterior scale matrix and the\n",
    "    updated shape/scale pair.\n",
    "    \"\"\"\n",
    "    n_obs, _ = X.shape\n",
    "    prior_precision = np.linalg.inv(V0)\n",
    "    gram = X.T @ X\n",
    "    cross = X.T @ y\n",
    "\n",
    "    # Posterior precision is prior precision plus the Gram matrix.\n",
    "    post_precision = prior_precision + gram\n",
    "    Vn = np.linalg.inv(post_precision)\n",
    "    mn = Vn @ (prior_precision @ m0 + cross)\n",
    "\n",
    "    an = a0 + 0.5 * n_obs\n",
    "    # y'y + m0' V0^{-1} m0 - mn' Vn^{-1} mn\n",
    "    quad_form = y @ y + m0.T @ prior_precision @ m0 - mn.T @ post_precision @ mn\n",
    "    bn = b0 + 0.5 * float(quad_form)\n",
    "    return mn, Vn, an, bn\n",
    "\n",
    "def sample_posterior_linear_conjugate(rng, X, y, m0=None, V0=None, a0=1.0, b0=1.0, T=200):\n",
    "    \"\"\"Draw T joint posterior samples (beta, sigma^2) for the conjugate linear model.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    rng : numpy Generator used for all random draws.\n",
    "    X, y : design matrix (n, p) and response vector (n,).\n",
    "    m0, V0 : prior mean and prior scale matrix; default to zeros(p) and 10 * I_p.\n",
    "    a0, b0 : prior shape/scale passed to the conjugate update.\n",
    "    T : number of posterior draws.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    betas : (T, p) array of coefficient draws.\n",
    "    sig2 : (T,) array of variance draws.\n",
    "    \"\"\"\n",
    "    _, p = X.shape\n",
    "    if m0 is None:\n",
    "        m0 = np.zeros(p)\n",
    "    if V0 is None:\n",
    "        V0 = np.eye(p) * 10.0\n",
    "\n",
    "    mn, Vn, an, bn = posterior_params_conjugate(X, y, m0, V0, a0, b0)\n",
    "\n",
    "    # Variances first, then coefficients conditional on each variance draw.\n",
    "    sig2 = invgamma_sample(rng, an, bn, size=T)       # (T,)\n",
    "    L = np.linalg.cholesky(Vn)\n",
    "    zn = rng.standard_normal(size=(T, p))\n",
    "    # Row t is mn + sqrt(sig2[t]) * L @ zn[t], so its covariance is sig2[t] * Vn.\n",
    "    betas = mn[None, :] + (np.sqrt(sig2)[:, None]) * (zn @ L.T)\n",
    "    return betas, sig2\n",
    "\n",
    "\n",
    "\n",
    "def aoi_conformal_set_for_point(betas, sig2, X_train, y_train, x_star, y_grid, rng):\n",
    "    \"\"\"Add-one-in importance-sampling conformity scores for one test covariate.\n",
    "\n",
    "    For every candidate response in `y_grid`, the T posterior draws are\n",
    "    re-weighted by their likelihood of (x_star, candidate). The conformity\n",
    "    score of a point is the weighted average of its likelihood over the draws.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    betas : (T, p) posterior coefficient draws.\n",
    "    sig2 : (T,) posterior variance draws.\n",
    "    X_train, y_train : training design matrix (n, p) and responses (n,).\n",
    "    x_star : (p,) design row of the test point.\n",
    "    y_grid : (Ny,) candidate response values.\n",
    "    rng : not used by this function; kept so existing callers keep working.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    s_train_mat : (n, Ny) scores of the training points for each candidate.\n",
    "    s_star_vec : (Ny,) score of the test point for each candidate.\n",
    "    \"\"\"\n",
    "    # Likelihood of every training point under every posterior draw: (T, n).\n",
    "    mu_train = (X_train @ betas.T).T\n",
    "    loglik_train = normal_logpdf(y_train[None, :], mu_train, sig2[:, None])\n",
    "    dens_train = np.exp(np.clip(loglik_train, -700, 50))\n",
    "\n",
    "    # Log-likelihood of each candidate response at x_star under every draw: (T, Ny).\n",
    "    mu_star = x_star @ betas.T\n",
    "    loglik_star = normal_logpdf(y_grid[None, :], mu_star[:, None], sig2[:, None])\n",
    "    dens_star = np.exp(np.clip(loglik_star, -700, 50))\n",
    "\n",
    "    # Normalise the importance weights in log space so each column sums to one\n",
    "    # even when every density is close to the floating-point underflow limit.\n",
    "    log_weights = loglik_star - logsumexp(loglik_star, axis=0, keepdims=True)\n",
    "    weights = np.exp(log_weights)\n",
    "\n",
    "    s_train_mat = dens_train.T @ weights\n",
    "    s_star_vec = np.sum(weights * dens_star, axis=0)\n",
    "\n",
    "    return s_train_mat, s_star_vec\n",
    "\n",
    "def randomized_full_conformal_mask(s_train_mat, s_star_vec, alpha, rng):\n",
    "    \"\"\"Boolean mask of the candidate responses kept in the conformal set.\n",
    "\n",
    "    For column j the p-value is\n",
    "        (1 + #{train scores < test score} + u_j * #{train scores == test score}) / (n + 1)\n",
    "    with u_j drawn from rng.uniform(); candidate j is kept when its p-value exceeds alpha.\n",
    "    `s_train_mat` has shape (n, Ny) and `s_star_vec` has shape (Ny,).\n",
    "    \"\"\"\n",
    "    n, Ny = s_train_mat.shape\n",
    "    test_scores = np.asarray(s_star_vec)[None, :Ny]\n",
    "    n_below = np.sum(s_train_mat < test_scores, axis=0)\n",
    "    n_tied = np.sum(s_train_mat == test_scores, axis=0)\n",
    "    # One scalar draw per candidate, in column order.\n",
    "    tie_breakers = np.array([rng.uniform() for _ in range(Ny)], dtype=float)\n",
    "    pvals = (1.0 + n_below + tie_breakers * n_tied) / (n + 1.0)\n",
    "    return pvals > alpha\n",
    "\n",
    "\n",
    "\n",
    "def _standard_error(values):\n",
    "    \"\"\"Standard error of the mean of `values`; NaN when fewer than two values are available.\"\"\"\n",
    "    values = np.asarray(values, dtype=float)\n",
    "    if values.size < 2:\n",
    "        return float(\"nan\")\n",
    "    return values.std(ddof=1) / np.sqrt(values.size)\n",
    "\n",
    "\n",
    "def _plot_metric_vs_n(res, mean_col, se_col, title, ylabel, ref_level=None):\n",
    "    \"\"\"Plot one error-bar curve per model of `mean_col` (+/- `se_col`) against n.\"\"\"\n",
    "    plt.figure()\n",
    "    for model_name in sorted(res[\"model\"].unique()):\n",
    "        dfm = res[res[\"model\"]==model_name].sort_values(\"n\")\n",
    "        plt.errorbar(dfm[\"n\"], dfm[mean_col], yerr=dfm[se_col], marker=\"o\", capsize=4, label=model_name)\n",
    "    if ref_level is not None:\n",
    "        plt.axhline(ref_level, linestyle=\"--\", alpha=0.7)\n",
    "    plt.title(title)\n",
    "    plt.xlabel(\"n\")\n",
    "    plt.ylabel(ylabel)\n",
    "    plt.legend()\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "def run_experiment2_california(\n",
    "    n_list=(100, 300, 600, 1000),\n",
    "    E=5,\n",
    "    alpha=0.2,\n",
    "    grid_size=400,\n",
    "    m_test=100,\n",
    "    T_samples=200,\n",
    "    seed0=random.randint(1,1000000)\n",
    "):\n",
    "    \"\"\"Run the conformal Bayes experiment on the California housing data and plot the results.\n",
    "\n",
    "    For every training size in `n_list` and each of the three designs\n",
    "    (M1_full, M2_small, M3_quadratic), E random train/test splits are drawn,\n",
    "    posterior samples are generated, and a conformal set is built on a grid\n",
    "    of candidate responses for each of the `m_test` test points.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    n_list : training-set sizes to evaluate.\n",
    "    E : number of repetitions per (n, model) pair.\n",
    "    alpha : miscoverage level; the nominal coverage is 1 - alpha.\n",
    "    grid_size : number of candidate response values in the grid.\n",
    "    m_test : number of test points per repetition.\n",
    "    T_samples : number of posterior draws per repetition.\n",
    "    seed0 : master seed. The default is drawn once, when the function is\n",
    "        defined, so pass an explicit value for a reproducible run. The seed\n",
    "        actually used is stored in the returned frame's `attrs[\"seed0\"]`.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame with one row per (n, model) and columns model, n,\n",
    "    coverage_mean, coverage_se, length_mean, length_se, time_model.\n",
    "    \"\"\"\n",
    "    rng_master = np.random.default_rng(seed0)\n",
    "    X, y, _ = load_california_standardized()\n",
    "\n",
    "    model_specs = [\n",
    "        (\"M1_full\", design_M1_full),\n",
    "        (\"M2_small\", design_M2_small),\n",
    "        (\"M3_quadratic\", design_M3_quadratic),\n",
    "    ]\n",
    "\n",
    "    all_rows = []\n",
    "\n",
    "    for n in n_list:\n",
    "        for model_name, design_fn in model_specs:\n",
    "            coverages = []\n",
    "            lengths = []\n",
    "            times_each = []\n",
    "\n",
    "            for rep in range(E):\n",
    "                seed_rep = int(rng_master.integers(0, 2**31-1))\n",
    "                Xtr_raw, ytr, Xte_raw, yte = sample_train_test(X, y, n_train=n, m_test=m_test, seed=seed_rep)\n",
    "\n",
    "                Xtr = design_fn(Xtr_raw)\n",
    "                Xte = design_fn(Xte_raw)\n",
    "                p = Xtr.shape[1]\n",
    "\n",
    "                # Prior hyper-parameters\n",
    "                m0 = np.zeros(p)\n",
    "                V0 = np.eye(p) * 10.0\n",
    "                a0, b0 = 1.0, 1.0\n",
    "\n",
    "                rng_rep = np.random.default_rng(seed_rep + 99)\n",
    "                # Posterior draw\n",
    "                betas, sig2 = sample_posterior_linear_conjugate(rng_rep, Xtr, ytr, m0=m0, V0=V0, a0=a0, b0=b0, T=T_samples)\n",
    "\n",
    "                # Candidate responses: training range padded by one unit on each side.\n",
    "                y_grid = np.linspace(ytr.min() - 1.0, ytr.max() + 1.0, grid_size)\n",
    "                dy = y_grid[1] - y_grid[0]\n",
    "\n",
    "                # Only the conformal step is timed; posterior sampling is excluded.\n",
    "                t_model0 = time.time()\n",
    "\n",
    "                for j in range(Xte.shape[0]):\n",
    "                    x_star = Xte[j]\n",
    "\n",
    "                    s_train_mat, s_star_vec = aoi_conformal_set_for_point(betas, sig2, Xtr, ytr, x_star, y_grid, rng_rep)\n",
    "                    mask = randomized_full_conformal_mask(s_train_mat, s_star_vec, alpha, rng_rep)\n",
    "\n",
    "                    # A response outside the grid cannot be in the set, so it must not\n",
    "                    # be credited with the status of the nearest end point.\n",
    "                    in_grid = bool(y_grid[0] <= yte[j] <= y_grid[-1])\n",
    "                    idx_true = int(np.clip(np.searchsorted(y_grid, yte[j]), 0, grid_size-1))\n",
    "                    coverage_flag = in_grid and bool(mask[idx_true])\n",
    "                    set_length = mask.sum() * dy\n",
    "\n",
    "                    coverages.append(coverage_flag)\n",
    "                    lengths.append(set_length)\n",
    "\n",
    "                times_each.append(time.time() - t_model0)\n",
    "\n",
    "            cov_arr = np.array(coverages).reshape(E, -1).mean(axis=1)  # mean over test points per rep\n",
    "            len_arr = np.array(lengths).reshape(E, -1).mean(axis=1)\n",
    "            tm_arr  = np.array(times_each)\n",
    "\n",
    "            all_rows.append({\n",
    "                \"model\": model_name,\n",
    "                \"n\": n,\n",
    "                \"coverage_mean\": cov_arr.mean(),\n",
    "                \"coverage_se\": _standard_error(cov_arr),\n",
    "                \"length_mean\": len_arr.mean(),\n",
    "                \"length_se\": _standard_error(len_arr),\n",
    "                \"time_model\": round(tm_arr.sum(), 2)\n",
    "            })\n",
    "\n",
    "    res = pd.DataFrame(all_rows).sort_values([\"n\",\"model\"]).reset_index(drop=True)\n",
    "    # Keep the seed with the results so the run can be repeated.\n",
    "    res.attrs[\"seed0\"] = seed0\n",
    "\n",
    "    # The reference line follows the requested level instead of a fixed 0.8.\n",
    "    _plot_metric_vs_n(\n",
    "        res, \"coverage_mean\", \"coverage_se\",\n",
    "        \"CB on California Data Average Coverage vs n\", \"Coverage Rate\",\n",
    "        ref_level=1.0 - alpha,\n",
    "    )\n",
    "    _plot_metric_vs_n(\n",
    "        res, \"length_mean\", \"length_se\",\n",
    "        \"CB on California Data Average Set Length vs n\", \"Average Set Length\",\n",
    "    )\n",
    "\n",
    "    return res\n",
    "\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Run the full experiment with the settings below when executed as the main module.\n",
    "    _ = run_experiment2_california(**{\n",
    "        \"n_list\": (100, 300, 600, 1000),  # training-set sizes\n",
    "        \"E\": 10,                          # repetitions per (n, model)\n",
    "        \"alpha\": 0.2,                     # miscoverage level\n",
    "        \"grid_size\": 500,                 # candidate responses per test point\n",
    "        \"m_test\": 100,                    # test points per repetition\n",
    "        \"T_samples\": 2000,                # posterior draws per repetition\n",
    "    })\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
