{"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":20814,"status":"ok","timestamp":1767614657929,"user":{"displayName":"D K","userId":"02556183042422178006"},"user_tz":-540},"id":"uux4cv_XRcX1","outputId":"f8848f15-f5eb-41cc-dc28-3da249cfdc5f"},"outputs":[{"name":"stdout","output_type":"stream","text":["Mounted at /content/drive\n"]}],"source":[
"# Mount Google Drive so the project folder under MyDrive is reachable.\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')"]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"gQStIrQtRiQU"},"outputs":[],"source":[
"import os, sys, pathlib\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Project working directory on the mounted Drive. It becomes the current\n",
"# directory and is put on sys.path so project modules can be imported.\n",
"WORKDIR = \"/content/drive/MyDrive/Colab Notebooks/CTE_Baseline/DML_methods\"\n",
"\n",
"os.makedirs(WORKDIR, exist_ok=True)\n",
"os.chdir(WORKDIR)\n",
"if WORKDIR not in sys.path:\n",
"    sys.path.insert(0, WORKDIR)\n",
"\n",
"# Also expose the Supplement folder itself, when it exists.\n",
"SUPP_DIR = os.path.join(WORKDIR, \"Supplement\")\n",
"if os.path.isdir(SUPP_DIR) and SUPP_DIR not in sys.path:\n",
"    sys.path.insert(0, SUPP_DIR)\n",
"\n",
"# Turn 'Supplement' into a regular package when it has no __init__.py yet.\n",
"# The star-import of estimation.py is only written when that file exists:\n",
"# an unconditional shim would make every later `import Supplement.<module>`\n",
"# fail inside __init__.py whenever estimation.py is missing.\n",
"init_path = os.path.join(SUPP_DIR, \"__init__.py\")\n",
"estimation_path = os.path.join(SUPP_DIR, \"estimation.py\")\n",
"if os.path.isdir(SUPP_DIR) and not os.path.exists(init_path):\n",
"    shim_body = \"from .estimation import *\\n\" if os.path.exists(estimation_path) else \"\"\n",
"    with open(init_path, \"w\", encoding=\"utf-8\") as f:\n",
"        f.write(shim_body)\n",
"    print(\"[info] Created Supplement/__init__.py shim\")"]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"AdocvFTRRjxr"},"outputs":[],"source":[
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install -q rpy2\n",
"\n",
"%load_ext rpy2.ipython\n"]},
{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"wKSJPrfxSEiq","outputId":"838206fe-382b-4165-a883-c426b16a67e7"},"outputs":[{"data":{"text/plain":["Installing package into ‘/usr/local/lib/R/site-library’\n","(as ‘lib’ is unspecified)\n","also installing the dependencies ‘zoo’, ‘DiceKriging’, ‘lmtest’, ‘sandwich’, ‘RcppEigen’\n","\n","trying URL 'https://cloud.r-project.org/src/contrib/zoo_1.8-15.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/DiceKriging_1.6.1.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/lmtest_0.9-40.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/sandwich_3.1-1.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/RcppEigen_0.3.4.0.2.tar.gz'\n","trying URL 'https://cloud.r-project.org/src/contrib/grf_2.5.0.tar.gz'\n","\n","The downloaded source packages are in\n","\t‘/tmp/RtmpCbNFUM/downloaded_packages’\n"]},"metadata":{},"output_type":"display_data"}],"source":[
"%%R\n",
"if (!requireNamespace(\"grf\", quietly = TRUE)) {\n",
"  install.packages(\"grf\", repos = \"https://cloud.r-project.org\")\n",
"}\n"]},
{"cell_type":"code","execution_count":null,"metadata":{"colab":{"background_save":true},"id":"n79DOViDRpWn","outputId":"d3967975-ee64-493f-fd78-cd663923910c"},"outputs":[{"name":"stderr","output_type":"stream","text":["/usr/local/lib/python3.12/dist-packages/torch/__init__.py:1275: UserWarning: torch.set_default_tensor_type() is deprecated as of PyTorch 2.1, please use torch.set_default_dtype() and torch.set_default_device() as alternatives. (Triggered internally at /pytorch/torch/csrc/tensor/python_tensor.cpp:434.)\n","  _C._set_default_tensor_type(t)\n"]},{"name":"stdout","output_type":"stream","text":["Reading /content/drive/MyDrive/Colab Notebooks/CTE_Baseline/DML_methods/Empirical Application/emp_app.csv ...\n","Shapes:\n","  X: (4024, 138)\n","  T: (4024,)\n","  Y: (4024,)\n","Fitting GRF for f*(x,a)...\n","Done. 
Example mu_hat[0], g[0]: 31.951727843049817 -31.951727843049817\n","Saved semi-synthetic base data (with mu_hat_grf, g_grf) to:\n","  /content/drive/MyDrive/Colab Notebooks/CTE_Baseline/DML_methods/Empirical Application/semi-syn data grf.csv\n","Computing h^*(t) over grid ...\n","Saved h^*(t) to:\n","  /content/drive/MyDrive/Colab Notebooks/CTE_Baseline/DML_methods/Empirical Application/h_star_grf_empapp.csv\n"]}],"source":[
"# -*- coding: utf-8 -*-\n",
"\"\"\"\n",
"Colab script: build semi-synthetic base objects for the Job Corps empirical application (GRF-based).\n",
"\n",
"What this script does:\n",
"  1) Load the empirical application dataset (emp_app.csv) and apply the exact same\n",
"     preprocessing as the baseline code (fixed shuffle + one-hot for int64 columns).\n",
"  2) Fit a GRF model for the regression function f*(x, a) using (a = d, x = covariates).\n",
"  3) Create and save a “semi-synthetic base” CSV that contains:\n",
"       - mu_hat_grf = f_hat(X_i, T_i) evaluated at observed (X_i, T_i)\n",
"       - g_grf      = residuals Y_i - mu_hat_grf\n",
"  4) Compute h*(t) = E_X[f*(X, t)] over a fixed t-grid and save it as a separate CSV.\n",
"\n",
"Outputs (saved under EMP_DIR):\n",
"  - \"semi-syn data grf.csv\"\n",
"  - \"h_star_grf_empapp.csv\"\n",
"\"\"\"\n",
"\n",
"# --- 1) Imports & paths ---\n",
"import pathlib\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# GRF (R grf wrapper)\n",
"from Supplement.rgrf import regression_forest as RF_grf\n",
"\n",
"# Data directory: emp_app.csv is read from here and both output CSVs are written here.\n",
"EMP_DIR = \"/content/drive/MyDrive/Colab Notebooks/CTE_Baseline/DML_methods/Data_and_Results\"\n",
"current_path = pathlib.Path(EMP_DIR)\n",
"\n",
"# Seed of the row shuffle; keep it equal to the one in the baseline empirical script.\n",
"SHUFFLE_SEED = 20\n",
"\n",
"# --- 2) Load empirical data + apply baseline preprocessing ---\n",
"data_path = current_path / \"emp_app.csv\"\n",
"print(f\"Reading {data_path} ...\")\n",
"data = pd.read_csv(data_path, index_col=0)\n",
"\n",
"# Fixed shuffle (match the baseline empirical script)\n",
"data = data.sample(frac=1, random_state=SHUFFLE_SEED)\n",
"\n",
"# One-hot encode int64 columns (match baseline preprocessing);\n",
"# every non-int64 column is carried over unchanged.\n",
"data = pd.concat(\n",
"    [\n",
"        data.select_dtypes(exclude=\"int64\"),\n",
"        pd.get_dummies(\n",
"            data.select_dtypes(\"int64\").astype(\"category\"),\n",
"            drop_first=True,\n",
"            dtype=float,\n",
"        ),\n",
"    ],\n",
"    axis=1,\n",
")\n",
"\n",
"# Split into covariates X, treatment T, outcome Y (same as baseline)\n",
"X = data.drop([\"d\", \"y\"], axis=1)\n",
"T = data[\"d\"]\n",
"Y = data[\"y\"]\n",
"\n",
"print(\"Shapes:\")\n",
"print(\"  X:\", X.shape)\n",
"print(\"  T:\", T.shape)\n",
"print(\"  Y:\", Y.shape)\n",
"\n",
"# --- 3) Fit GRF regression for f*(x,a) and construct (mu_hat, g) ---\n",
"def fit_fhat_semi_grf(X: pd.DataFrame, T: pd.Series, Y: pd.Series):\n",
"    \"\"\"\n",
"    Fit a GRF regression model for f*(x,a) using features [a, x].\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : pd.DataFrame\n",
"        Covariates (n x p); every column must be convertible to float.\n",
"    T : pd.Series\n",
"        Treatment values, in the same row order as X.\n",
"    Y : pd.Series\n",
"        Outcomes, in the same row order as X.\n",
"\n",
"    Returns\n",
"    -------\n",
"    grf_model : fitted GRF model object\n",
"    mu_hat : np.ndarray, shape (n,)\n",
"        Predictions f_hat(X_i, T_i) evaluated at observed (X_i, T_i).\n",
"    g : np.ndarray, shape (n,)\n",
"        Residuals g_i = Y_i - mu_hat_i.\n",
"    fhat : callable\n",
"        Function fhat(x_block, a_scalar) that predicts f_hat(x, a) for:\n",
"          - x_block : DataFrame of covariates (n x p) with the same column\n",
"                      names as X (they are re-ordered to the training order)\n",
"          - a_scalar: scalar treatment value a\n",
"\n",
"    Raises\n",
"    ------\n",
"    ValueError\n",
"        If X, T and Y differ in length, or if the model does not return\n",
"        exactly one prediction per input row.\n",
"    KeyError\n",
"        From fhat, if x_block lacks one of the training covariates.\n",
"    \"\"\"\n",
"    n_obs = len(X)\n",
"    if len(T) != n_obs or len(Y) != n_obs:\n",
"        raise ValueError(\n",
"            f\"X, T and Y must have the same length; got {n_obs}, {len(T)}, {len(Y)}\"\n",
"        )\n",
"\n",
"    # Column order the model is trained on; fhat enforces it at prediction time.\n",
"    covariate_names = list(X.columns)\n",
"\n",
"    # Design matrix for GRF: treatment first, then the covariates. Indexes are\n",
"    # reset so the concat is positional rather than label-aligned.\n",
"    X_rf = pd.concat(\n",
"        [\n",
"            T.rename(\"d\").reset_index(drop=True),\n",
"            X.reset_index(drop=True),\n",
"        ],\n",
"        axis=1,\n",
"    )\n",
"    # An explicit float dtype fails loudly on a non-numeric column instead of\n",
"    # silently handing an object array to the forest.\n",
"    design = X_rf.to_numpy(dtype=float)\n",
"    y_obs = Y.to_numpy(dtype=float)\n",
"\n",
"    # rgrf.py is assumed to expose a sklearn-like API\n",
"    # NOTE(review): the forest is built with default arguments; if the wrapper\n",
"    # accepts a seed, set it so mu_hat / h* are reproducible - TODO confirm.\n",
"    grf_model = RF_grf()\n",
"\n",
"    # Fit the GRF model on (a, x) -> y\n",
"    grf_model.fit(design, y_obs)\n",
"\n",
"    def _predict_1d(features: np.ndarray) -> np.ndarray:\n",
"        # Flatten to (n,) so that `Y - mu_hat` can never broadcast to (n, n)\n",
"        # if the wrapper returns a column vector.\n",
"        preds = np.asarray(grf_model.predict(features), dtype=float).ravel()\n",
"        if preds.shape[0] != features.shape[0]:\n",
"            raise ValueError(\n",
"                f\"expected {features.shape[0]} predictions, got {preds.shape[0]}\"\n",
"            )\n",
"        return preds\n",
"\n",
"    # Predictions at observed (X_i, T_i)\n",
"    # NOTE(review): these are requested on the training rows themselves; whether\n",
"    # the wrapper returns in-sample or out-of-bag values is not visible here.\n",
"    mu_hat = _predict_1d(design)\n",
"    g = y_obs - mu_hat\n",
"\n",
"    # Predictor fhat(x_block, a_scalar) for arbitrary a on a block of x's\n",
"    def fhat(x_block: pd.DataFrame, a_scalar: float):\n",
"        # Re-order (or fail on missing) columns so they line up with training.\n",
"        if list(x_block.columns) != covariate_names:\n",
"            x_block = x_block[covariate_names]\n",
"        n = len(x_block)\n",
"        A = np.full((n, 1), float(a_scalar))\n",
"        X_pred = np.hstack([A, x_block.to_numpy(dtype=float)])\n",
"        return _predict_1d(X_pred)\n",
"\n",
"    return grf_model, mu_hat, g, fhat\n",
"\n",
"\n",
"print(\"Fitting GRF for f*(x,a)...\")\n",
"grf_model, mu_hat, g, fhat = fit_fhat_semi_grf(X, T, Y)\n",
"print(\"Done. Example mu_hat[0], g[0]:\", mu_hat[0], g[0])\n",
"\n",
"# --- 4) Save semi-synthetic base data (mu_hat_grf, g_grf) ---\n",
"# Attach mu_hat and g to the processed empirical dataset and save to CSV.\n",
"# Both are plain arrays in the row order of `data`, so assignment is positional.\n",
"out_df = data.copy()\n",
"out_df[\"mu_hat_grf\"] = mu_hat\n",
"out_df[\"g_grf\"] = g\n",
"\n",
"out_name = \"semi-syn data grf.csv\"\n",
"out_path = current_path / out_name\n",
"\n",
"out_df.to_csv(out_path, index=True)\n",
"print(f\"Saved semi-synthetic base data (with mu_hat_grf, g_grf) to:\\n  {out_path}\")\n",
"\n",
"# --- 5) Compute h*(t) = E_X[f*(X,t)] over a fixed t-grid and save ---\n",
"# t-grid used in the baseline empirical application (adjust if needed)\n",
"t_list = np.arange(160, 2001, 40)\n",
"\n",
"\n",
"def compute_h_star_over_grid(X: pd.DataFrame, fhat, t_list: np.ndarray):\n",
"    \"\"\"\n",
"    Compute h*(t) = E_X[f*(X,t)] on a given t-grid using the fitted fhat.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    X : DataFrame\n",
"        Covariates (n x p).\n",
"    fhat : callable\n",
"        Function fhat(X_block, t_scalar) -> predictions (n,).\n",
"    t_list : np.ndarray\n",
"        Grid of treatment values.\n",
"\n",
"    Returns\n",
"    -------\n",
"    np.ndarray\n",
"        h_star values aligned with t_list (empty if t_list is empty).\n",
"    \"\"\"\n",
"    # One full pass over X per grid point: average the predictions at treatment t.\n",
"    return np.array([np.mean(fhat(X, t)) for t in t_list])\n",
"\n",
"\n",
"print(\"Computing h^*(t) over grid ...\")\n",
"h_star_vals = compute_h_star_over_grid(X, fhat, t_list)\n",
"\n",
"h_df = pd.DataFrame({\n",
"    \"t\": t_list,\n",
"    \"h_star\": h_star_vals,\n",
"})\n",
"\n",
"h_out_name = \"h_star_grf_empapp.csv\"\n",
"h_out_path = current_path / h_out_name\n",
"\n",
"h_df.to_csv(h_out_path, index=False)\n",
"print(f\"Saved h^*(t) to:\\n  {h_out_path}\")\n"]}],"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyNuvI3Wm5cW2pfizw9Ja1/Z"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}