{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":18491,"status":"ok","timestamp":1768250332020,"user":{"displayName":"D K","userId":"02556183042422178006"},"user_tz":-540},"id":"PdT-NG4N5qb5","outputId":"654be191-97e3-4277-f3f2-1d9a0e78581a"},"outputs":[{"name":"stdout","output_type":"stream","text":["Mounted at /content/drive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')\n"]},{"cell_type":"code","execution_count":2,"metadata":{"executionInfo":{"elapsed":2390,"status":"ok","timestamp":1768250334409,"user":{"displayName":"D K","userId":"02556183042422178006"},"user_tz":-540},"id":"sd9GtRv7ljBB"},"outputs":[],"source":["import os, sys, pathlib\n","import numpy as np\n","import pandas as pd\n","\n","# Your existing working directory (same as in your current Colab)\n","WORKDIR = \"/content/drive/MyDrive/Colab Notebooks/CTE_Baseline/DML_methods\"\n","\n","\n","os.makedirs(WORKDIR, exist_ok=True)\n","os.chdir(WORKDIR)\n","if WORKDIR not in sys.path:\n","    sys.path.insert(0, WORKDIR)\n","\n","SUPP_DIR = os.path.join(WORKDIR, \"Supplement\")\n","if os.path.isdir(SUPP_DIR) and SUPP_DIR not in sys.path:\n","    sys.path.insert(0, SUPP_DIR)\n","\n","# Ensure 'Supplement' is a package if only estimation.py exists\n","init_path = os.path.join(SUPP_DIR, \"__init__.py\")\n","if os.path.isdir(SUPP_DIR) and not os.path.exists(init_path):\n","    with open(init_path, \"w\", encoding=\"utf-8\") as f:\n","        f.write(\"from .estimation import *\\n\")\n","    print(\"[info] Created Supplement/__init__.py shim\")"]},{"cell_type":"code","execution_count":3,"metadata":{"executionInfo":{"elapsed":5087,"status":"ok","timestamp":1768250339501,"user":{"displayName":"D K","userId":"02556183042422178006"},"user_tz":-540},"id":"unHNZPT5DRjv"},"outputs":[],"source":["!pip install -q rpy2\n","\n","%load_ext 
rpy2.ipython\n"]},{"cell_type":"code","execution_count":4,"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":249},"executionInfo":{"elapsed":277358,"status":"ok","timestamp":1768250616863,"user":{"displayName":"D K","userId":"02556183042422178006"},"user_tz":-540},"id":"YYDtkA4uDTbC","outputId":"469b3475-74b2-486e-ac32-2d657515df12"},"outputs":[{"data":{"text/plain":["Installing package into ‘/usr/local/lib/R/site-library’\n","(as ‘lib’ is unspecified)\n","also installing the dependencies ‘zoo’, ‘DiceKriging’, ‘lmtest’, ‘sandwich’, ‘RcppEigen’\n","\n","trying URL 'https://cloud.r-project.org/src/contrib/zoo_1.8-15.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/DiceKriging_1.6.1.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/lmtest_0.9-40.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/sandwich_3.1-1.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/RcppEigen_0.3.4.0.2.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/grf_2.5.0.tar.gz'\n","\n","The downloaded source packages are in\n","\t‘/tmp/RtmpXj1r8x/downloaded_packages’\n"]},"metadata":{},"output_type":"display_data"}],"source":["%%R\n","if (!requireNamespace(\"grf\", quietly = TRUE)) {\n","  install.packages(\"grf\", repos = \"https://cloud.r-project.org\")\n","}\n"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true,"base_uri":"https://localhost:8080/"},"id":"4PzahCHzk-dg"},"outputs":[{"name":"stdout","output_type":"stream","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]},{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.12/dist-packages/torch/__init__.py:1275: UserWarning: torch.set_default_tensor_type() is deprecated as of PyTorch 2.1, please use torch.set_default_dtype() and torch.set_default_device() as alternatives. 
# -*- coding: utf-8 -*-
"""
Empirical Application: Job Corps (DDMLCT) — GRF version

What this script saves (to RESULTS_DIR):
1) estimates_GRF_seed{first}_to_seed{last}.csv
   - Stage-2 beta(t) estimates for every simulation seed
   - plus summary rows: mean, se

2) MISE_GRF_seed{first}_to_seed{last}.csv
   - MISE per seed (Stage-2 only)
   - plus summary rows: mean, se

Stage-1 beta(t) estimates are NOT written to disk by this cell; they are only
returned in results["stage1_beta_mat"].

Summary convention: with fewer than two seeds the standard error cannot be
estimated and is reported as 0.0 in both files (same rule as summarize_list).
"""

import importlib
import logging
import os
import pathlib
import sys

import numpy as np
import pandas as pd

# --- Environment Setup (Colab optional) ---
try:
    from google.colab import drive  # type: ignore

    drive.mount("/content/drive")
    BASE_DIR = pathlib.Path(
        "/content/drive/MyDrive/Colab Notebooks/CTE_Baseline/DML_methods"
    )
except Exception:
    # Deliberate best-effort: outside Colab (or if mounting fails) fall back
    # to the current directory.
    BASE_DIR = pathlib.Path(".").resolve()

# Add working directory and Supplement package to path.
# Guarded so that re-running the cell does not keep growing sys.path.
if str(BASE_DIR) not in sys.path:
    sys.path.append(str(BASE_DIR))
SUPP_DIR = BASE_DIR / "Supplement"
if SUPP_DIR.exists() and str(SUPP_DIR) not in sys.path:
    sys.path.append(str(SUPP_DIR))

# Logging Setup.
# force=True replaces handlers already installed on the root logger; without
# it basicConfig is a no-op in such environments and INFO records are dropped.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)
logger = logging.getLogger(__name__)

# Output Directories (match NN style)
RESULTS_DIR = BASE_DIR / "Data_and_Results" / "Estimates"


def load_supplement():
    """Import the project's Supplement package and return what the run needs.

    A directory without ``__init__.py`` is importable as a namespace package,
    so a plain ``import Supplement`` succeeds without executing
    ``estimation.py`` and ``Supplement.DDMLCT`` would then be missing. The
    shim is therefore created *before* importing, not in an ImportError
    handler.

    Returns:
        tuple: ``(DDMLCT, regression_forest, regression_forest2)``.

    Raises:
        ImportError: if the Supplement package (or one of its own imports)
            cannot be found.
    """
    init_path = SUPP_DIR / "__init__.py"
    if SUPP_DIR.is_dir() and not init_path.exists():
        with open(init_path, "w", encoding="utf-8") as f:
            f.write("from .estimation import *\n")
        # Make the import system notice the file that was just created.
        importlib.invalidate_caches()
        # Drop a namespace-package entry cached by an earlier import attempt.
        cached = sys.modules.get("Supplement")
        if cached is not None and getattr(cached, "__file__", None) is None:
            del sys.modules["Supplement"]

    import Supplement
    from Supplement.rgrf import regression_forest, regression_forest2

    return Supplement.DDMLCT, regression_forest, regression_forest2


# ===========================================================
# 1) Data Loading & Helpers
# ===========================================================
def load_jobcorps_data():
    """Load the Job Corps data, semi-synthetic components and ground truth.

    Reads three CSV files from ``BASE_DIR / "Data_and_Results"``:
    ``emp_app.csv``, ``semi-syn data grf.csv`` and ``h_star_grf_empapp.csv``.

    Returns:
        tuple: ``(X, T, Y_emp, mu_hat, g, t_grid, h_star_vals)`` where ``X`` is
        the covariate frame (int64 columns one-hot encoded), ``T`` the ``d``
        column, ``Y_emp`` the ``y`` column, ``mu_hat`` / ``g`` the
        ``mu_hat_grf`` / ``g_grf`` columns aligned to the rows of ``X``, and
        ``t_grid`` / ``h_star_vals`` the evaluation grid and its reference
        values.

    Raises:
        ValueError: if the evaluation grid and ``h_star`` differ in length.
    """
    emp_dir = BASE_DIR / "Data_and_Results"
    data_path = emp_dir / "emp_app.csv"
    semi_path = emp_dir / "semi-syn data grf.csv"
    h_star_path = emp_dir / "h_star_grf_empapp.csv"

    logger.info("Loading emp_app.csv...")
    data = pd.read_csv(data_path, index_col=0)

    # Consistent shuffling (fixed seed)
    data = data.sample(frac=1, random_state=20)

    # One-hot encoding (match your baseline): int64 columns are treated as
    # categorical, every other column is passed through untouched.
    data = pd.concat(
        [
            data.select_dtypes(exclude=["int64"]),
            pd.get_dummies(
                data.select_dtypes(include=["int64"]).astype("category"),
                drop_first=True,
                dtype=float,
            ),
        ],
        axis=1,
    )

    X = data.drop(["d", "y"], axis=1)
    T = data["d"]
    Y_emp = data["y"]

    logger.info("Loading semi-synthetic components...")
    semi_df = pd.read_csv(semi_path, index_col=0)
    # Re-align to the shuffled row order of `data` when the file order differs.
    if not np.array_equal(semi_df.index.values, data.index.values):
        semi_df = semi_df.loc[data.index]

    mu_hat = semi_df["mu_hat_grf"].to_numpy()
    g = semi_df["g_grf"].to_numpy()

    logger.info("Loading h_star ground truth...")
    h_star_df = pd.read_csv(h_star_path)

    # Keep t grid from file if available
    if "t" in h_star_df.columns:
        t_grid = h_star_df["t"].to_numpy()
    else:
        # Fallback (should not happen for your empapp file)
        t_grid = np.arange(160, 2001, 40)

    h_star_vals = h_star_df["h_star"].to_numpy()

    # Fail before any expensive fit: a mismatch here would otherwise only
    # surface (or silently broadcast) when the MISE is computed.
    if len(t_grid) != len(h_star_vals):
        raise ValueError(
            f"t grid has {len(t_grid)} points but h_star has {len(h_star_vals)} values"
        )

    return X, T, Y_emp, mu_hat, g, t_grid, h_star_vals


def gen_semi_y(mu_hat, g, rng):
    """Draw a semi-synthetic outcome ``mu_hat + e * g`` with random signs ``e``.

    Args:
        mu_hat: array of fitted values.
        g: array of the same length scaling the sign noise.
        rng: ``numpy.random.Generator`` used to draw ``e`` from ``{-1, +1}``.

    Returns:
        numpy.ndarray: the generated outcome, one value per entry of ``mu_hat``.
    """
    n = len(mu_hat)
    e = rng.choice([-1.0, 1.0], size=n)
    return mu_hat + e * g


def mise_against(est_beta, h_star_vals):
    """Mean squared difference between an estimate and the reference curve.

    Args:
        est_beta: estimated values on the evaluation grid.
        h_star_vals: reference values on the same grid.

    Returns:
        float: ``mean((est_beta - h_star_vals) ** 2)``.

    Raises:
        ValueError: if the two inputs do not have the same shape (NumPy would
            otherwise broadcast them and return a meaningless number).
    """
    est = np.asarray(est_beta)
    ref = np.asarray(h_star_vals)
    if est.shape != ref.shape:
        raise ValueError(
            f"shape mismatch: estimate {est.shape} vs reference {ref.shape}"
        )
    return float(np.mean((est - ref) ** 2))


def summarize_list(x_list):
    """Return ``(mean, std, se)`` of a list of numbers.

    ``std`` uses ``ddof=1``. With fewer than two values ``std`` and ``se`` are
    reported as 0.0; with no values ``mean`` is 0.0 as well.
    """
    arr = np.asarray(x_list, dtype=float)
    mean = float(arr.mean()) if len(arr) > 0 else 0.0
    std = float(arr.std(ddof=1)) if len(arr) > 1 else 0.0
    se = float(std / np.sqrt(len(arr))) if len(arr) > 0 else 0.0
    return mean, std, se


def columnwise_se(mat):
    """Standard error of each column of a ``(n_runs, n_points)`` matrix.

    Follows the same convention as ``summarize_list``: with fewer than two
    rows the result is 0.0 for every column instead of NaN (and no NumPy
    "degrees of freedom" warning is emitted).
    """
    mat = np.asarray(mat, dtype=float)
    n_rows = mat.shape[0]
    if n_rows < 2:
        return np.zeros(mat.shape[1])
    return mat.std(axis=0, ddof=1) / np.sqrt(n_rows)


def build_beta_table(beta_mat, seeds, t_grid):
    """Assemble the per-seed beta(t) table plus ``mean`` and ``se`` rows.

    Args:
        beta_mat: ``(len(seeds), len(t_grid))`` array of estimates.
        seeds: seed of each row of ``beta_mat``.
        t_grid: evaluation grid; column names are ``t_{int(t)}``.

    Returns:
        pandas.DataFrame: one row per seed followed by the two summary rows,
        with a leading ``seed`` column.
    """
    beta_mat = np.asarray(beta_mat, dtype=float)
    t_cols = [f"t_{int(t)}" for t in t_grid]

    per_seed = pd.DataFrame(beta_mat, columns=t_cols)
    per_seed.insert(0, "seed", seeds)

    mean_row = pd.DataFrame([beta_mat.mean(axis=0)], columns=t_cols)
    mean_row.insert(0, "seed", "mean")

    se_row = pd.DataFrame([columnwise_se(beta_mat)], columns=t_cols)
    se_row.insert(0, "seed", "se")

    return pd.concat([per_seed, mean_row, se_row], ignore_index=True)


def build_mise_table(mise_list, seeds):
    """Assemble the per-seed MISE table plus ``mean`` and ``se`` rows."""
    per_seed = pd.DataFrame({"seed": seeds, "mise": mise_list})
    mise_mean, _, mise_se = summarize_list(mise_list)
    summary = pd.DataFrame(
        [
            {"seed": "mean", "mise": mise_mean},
            {"seed": "se", "mise": mise_se},
        ]
    )
    return pd.concat([per_seed, summary], ignore_index=True)


# ===========================================================
# 2) Main Simulation Function (GRF)
# ===========================================================
def run_grf_simulation(K_runs=100, base_seed=1):
    """Run ``K_runs`` semi-synthetic simulations of the two-stage GRF fit.

    For seed ``base_seed + k`` the function draws a synthetic outcome,
    permutes the rows, fits the estimator at bandwidths ``h_first`` and
    ``h_first * u``, derives a bandwidth from those two fits and refits at
    0.8 times that bandwidth. Nothing is written to disk here.

    Args:
        K_runs: number of simulations (must be at least 1).
        base_seed: seed of the first simulation.

    Returns:
        dict: ``t_grid``, ``h_star``, ``stage1_beta_mat``, ``stage2_beta_mat``
        (one row per seed), ``mise_list`` and ``seeds``.

    Raises:
        ValueError: if ``K_runs`` is smaller than 1.
        ImportError: if the Supplement package cannot be imported.
    """
    if K_runs < 1:
        raise ValueError(f"K_runs must be at least 1, got {K_runs}")

    # Project import is done here (not at module level) so the helper
    # functions above stay importable without the Supplement package.
    DDML_Class, RF_grf, RF2_grf = load_supplement()

    # Load data
    X, T, _, mu_hat, g, t_list, h_star_vals = load_jobcorps_data()
    n_obs = len(T)

    # Match your original hyperparams/config
    h_rule = np.std(T) * 3 * (n_obs ** (-0.2))
    h_first = 2 * h_rule
    # L is passed straight to fit(); u is the ratio between the two stage-1
    # bandwidths. NOTE(review): L is presumably the number of cross-fitting
    # folds — confirm against Supplement.DDMLCT.
    L, u = 5, 0.5

    # GRF models (same as your GRF script); the same two model objects are
    # handed to every estimator below.
    model_rf1 = RF_grf()
    model_rf2 = RF2_grf()

    # Storage
    seeds = []
    stage1_beta = []
    stage2_beta = []
    stage2_mise = []

    logger.info(f"Starting {K_runs} simulations (GRF)...")

    for k in range(K_runs):
        seed = base_seed + k
        seeds.append(seed)

        print("", flush=True)
        print(f"[Progress] Processing Simulation {k+1}/{K_runs} (Seed: {seed})", flush=True)

        rng_sim = np.random.default_rng(seed)

        # Generate and shuffle (match your original flow). The order of the
        # two draws matters for reproducibility: outcome first, permutation second.
        Y_syn = gen_semi_y(mu_hat, g, rng_sim)
        perm = rng_sim.permutation(n_obs)

        X_k = X.iloc[perm].reset_index(drop=True)
        T_k = T.iloc[perm].reset_index(drop=True)
        Y_k = np.asarray(Y_syn[perm], float)

        # Stage 1: two bandwidth fits
        m1 = DDML_Class(model_rf1, model_rf2)
        m1.fit(X_k, T_k, Y_k, t_list, L, h=h_first, basis=False, standardize=True)

        m2 = DDML_Class(model_rf1, model_rf2)
        m2.fit(X_k, T_k, Y_k, t_list, L, h=h_first * u, basis=False, standardize=True)

        # Bandwidth selection: scaled difference of the two stage-1 estimates,
        # combined with m2.Vt and averaged over the grid.
        Bt = (m1.beta - m2.beta) / ((m1.h ** 2) * (1 - (u ** 2)))
        h_star_ml = np.mean(((m2.Vt / (4 * (Bt ** 2))) ** 0.2) * (m1.n ** -0.2))

        # Stage-1 beta (at h_first)
        stage1_beta.append(np.asarray(m1.beta, dtype=float))

        # Stage 2: final fit with optimized bandwidth
        h_second = 0.8 * h_star_ml
        m_final = DDML_Class(model_rf1, model_rf2)
        m_final.fit(X_k, T_k, Y_k, t_list, L, h=h_second, basis=False, standardize=True)

        beta2 = np.asarray(m_final.beta, dtype=float)
        stage2_beta.append(beta2)

        mise2 = mise_against(beta2, h_star_vals)
        stage2_mise.append(mise2)

        if (k + 1) % 10 == 0:
            logger.info(f"Simulation {k+1}/{K_runs} completed.")

    # Print Stage-2 MISE summary (console)
    mean2, std2, se2 = summarize_list(stage2_mise)
    print("\n" + "=" * 50)
    print(f"RESULTS (GRF, K={K_runs}) - Second Stage Only")
    print("=" * 50)
    print("Method: GRF")
    print("  [Stage 2: h = 0.8 * h_star_ml]")
    print(f"    Mean MISE : {mean2:.6f}")
    print(f"    Std MISE  : {std2:.6f}")
    print(f"    SE MISE   : {se2:.6f}")
    print("=" * 50 + "\n")

    return {
        "t_grid": np.asarray(t_list),
        "h_star": np.asarray(h_star_vals),
        "stage1_beta_mat": np.vstack(stage1_beta),
        "stage2_beta_mat": np.vstack(stage2_beta),
        "mise_list": stage2_mise,
        "seeds": seeds,
    }


# ===========================================================
# 3) Execution + Saving
# ===========================================================
K_SIM = 1
BASE_SEED = 1

# In a notebook __name__ is "__main__", so this section runs exactly as a
# plain top-level script would; the guard only keeps an `import` of this code
# from starting a multi-hour simulation.
if __name__ == "__main__":
    # Create the output folder up front so a non-writable location fails
    # before the simulation, not after it.
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Working Directory: {BASE_DIR}")
    print(f"Results Directory: {RESULTS_DIR}")

    results = run_grf_simulation(K_runs=K_SIM, base_seed=BASE_SEED)

    first_seed = results["seeds"][0]
    last_seed = results["seeds"][-1]

    # ---- Save Stage-2 beta estimates (MAIN requested output) ----
    df_beta2_final = build_beta_table(
        results["stage2_beta_mat"], results["seeds"], results["t_grid"]
    )

    out_beta2 = RESULTS_DIR / f"estimates_GRF_seed{first_seed}_to_seed{last_seed}.csv"
    df_beta2_final.to_csv(out_beta2, index=False)
    print(f"Saved Stage-2 beta estimates to: {out_beta2}")

    # ---- Save MISE only ----
    df_mise_final = build_mise_table(results["mise_list"], results["seeds"])

    out_mise = RESULTS_DIR / f"MISE_GRF_seed{first_seed}_to_seed{last_seed}.csv"
    df_mise_final.to_csv(out_mise, index=False)
    print(f"Saved MISE-only table to: {out_mise}")

    logger.info("Execution finished.")