{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "QVpBrpMZF38b"
      },
      "outputs": [],
      "source": [
        "# =========================================\n",
        "# MIMIC-IV: AKI causal inference with Notes (LLM hook, single-file)\n",
        "# =========================================\n",
        "from __future__ import annotations\n",
        "\n",
        "# Imports\n",
        "import os, re, json, time, math, datetime as dt\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "from google.colab import drive\n",
        "import statsmodels.api as sm\n",
        "import matplotlib.pyplot as plt\n",
        "from pathlib import Path\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "import gc"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tx2OPQPgSyTE"
      },
      "outputs": [],
      "source": [
        "try:\n",
        "    from lifelines import CoxPHFitter\n",
        "except Exception:\n",
        "    CoxPHFitter = None"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HumyGpvY8qFm"
      },
      "outputs": [],
      "source": [
        "!pip install python-dotenv -q\n",
        "from dotenv import load_dotenv\n",
        "from __future__ import annotations\n",
        "from openai import OpenAI"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VDAWJzr9HkY3",
        "outputId": "34e968b3-1ea8-44ab-dd82-879ad728724a"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {},
          "execution_count": 4
        }
      ],
      "source": [
        "drive.mount('/content/drive')\n",
        "drive_path = '/content/drive/MyDrive/'\n",
        "load_dotenv(drive_path + 'Colab Notebooks/env_config/.env')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OCMzyiFcHuJs"
      },
      "outputs": [],
      "source": [
        "# --- Config ---\n",
        "pd.set_option(\"display.max_rows\", 8)\n",
        "MIMIC_DIR = Path(\"/content/drive/MyDrive/data/mimiciv/3.1/\")\n",
        "HOSP = MIMIC_DIR / \"hosp\"\n",
        "NOTE = MIMIC_DIR / \"note\"\n",
        "ICU  = MIMIC_DIR / \"icu\"\n",
        "OUTD = MIMIC_DIR / \"results_ci\"; OUTD.mkdir(parents=True, exist_ok=True)\n",
        "# 큰 csv.gz는 청크로\n",
        "READ_KW = dict(dtype_backend=\"pyarrow\" , low_memory=False)\n",
        "\n",
        "# ---------- Causal / modeling ----------\n",
        "RANDOM_STATE = 7\n",
        "PS_CLIP      = (1e-3, 1-1e-3)\n",
        "W_TRIM       = (0.01, 0.99)\n",
        "VPT_WINDOW_HOURS = 6\n",
        "COX_PENALIZER = 0.1\n",
        "\n",
        "# ---------- Regex / patterns ----------\n",
        "RX_SCR_LABEL  = \"creatinine\"                        # simple substring (no regex)\n",
        "RX_SCR_FLUID  = r\"\\b(?:serum|blood)\\b\"              # regex\n",
        "RX_MGDL       = r\"\\bmg/dl\\b\"\n",
        "RX_MGL        = r\"\\bmg/l\\b\"\n",
        "RX_VANCO_SUB  = \"vancomycin\"                        # simple substring\n",
        "RX_PTZ        = r\"(?:piperacillin|tazobactam|zosyn)\"\n",
        "RX_EMERG_SUB  = \"EMER\"                              # admission_type contains \"EMER\"\n",
        "\n",
        "# 텍스트 피처(만성=confounder 후보 / 급성=collider 위험 → *_pre로 LLM에서만 사용)\n",
        "TEXT_PATTERNS_CHRONIC = {\n",
        "    \"f_ckd\":       \"chronic kidney disease / ESRD / dialysis history\",\n",
        "    \"f_dm\":        \"diabetes mellitus\",\n",
        "    \"f_hf\":        \"heart failure (any; HFrEF/HFpEF)\",\n",
        "    \"f_liver\":     \"cirrhosis / ESLD / hepatic failure\",\n",
        "    \"f_nephrotox\": \"nephrotoxic baseline meds (NSAIDs, ACEi/ARB, aminoglycosides, calcineurin inhibitors)\",\n",
        "}\n",
        "TEXT_PATTERNS_ACUTE_PRE = {\n",
        "    \"f_sepsis_pre\":   \"sepsis present before or at index (on admission)\",\n",
        "    \"f_shock_pre\":    \"shock/vasopressor need before or at index\",\n",
        "    \"f_contrast_pre\": \"iodinated contrast exposure before index (e.g., pre-admission CTA)\",\n",
        "    \"f_vent_pre\":     \"mechanical ventilation before or at index\",\n",
        "}\n",
        "ALL_CONCEPTS_FOR_LLM = {**TEXT_PATTERNS_CHRONIC, **TEXT_PATTERNS_ACUTE_PRE}\n",
        "\n",
        "# ---------- LLM Config ----------\n",
        "OPENAI_API_KEY = os.getenv(\"OPENAI_API_KEY\") or os.getenv(\"API_KEY\")\n",
        "LLM_MODEL = \"gpt-4o-mini\"\n",
        "MAX_NOTE_CHARS = 15000\n",
        "LLM_BATCH_SIZE = 24\n",
        "LLM_TEMPERATURE = 0.0\n",
        "LLM_MAX_TOKENS = 200           # 짧은 JSON만 반환\n",
        "LLM_TIMEOUT = 60\n",
        "CONFOUNDERS = [\"f_ckd_pre\", \"f_dm_pre\", \"f_hf_pre\", \"f_liver_pre\", \"f_nephrotox_pre\"]\n",
        "\n",
        "# ---------- LLM prompt (pre-treatment only) ----------\n",
        "LLM_PROMPT_TEMPLATE = \"\"\"\n",
        "You are assisting a causal inference study of AKI. Exposure = vancomycin±piperacillin/tazobactam.\n",
        "Your ONLY task: read a discharge note and mark **pre-treatment** (pre-admission or at presentation) risk factors.\n",
        "\n",
        "Rules:\n",
        "- Consider ONLY information existing **before or at presentation** relative to index_time = {index_time_iso}.\n",
        "- DO NOT mark conditions/events clearly arising during hospitalization, hospital course, ICU interventions, inpatient treatments, or discharge meds. Those are potential colliders.\n",
        "- If timing is ambiguous, be conservative and mark 0.\n",
        "- Output a compact ONE-LINE JSON with 0/1 integers. No extra text.\n",
        "\n",
        "Binary variables (confounders of interest):\n",
        "- f_ckd_pre: chronic kidney disease / ESRD / chronic dialysis BEFORE presentation.\n",
        "- f_dm_pre: diabetes mellitus history.\n",
        "- f_hf_pre: heart failure (any phenotype), chronic.\n",
        "- f_liver_pre: cirrhosis or end-stage liver disease (chronic).\n",
        "- f_nephrotox_pre: chronic/home exposure to nephrotoxins (NSAIDs, ACEi/ARB, calcineurin inhibitors, chronic aminoglycosides). Ignore inpatient doses.\n",
        "\n",
        "Return ONLY:\n",
        "{{\n",
        "  \"f_ckd_pre\": 0 or 1,\n",
        "  \"f_dm_pre\": 0 or 1,\n",
        "  \"f_hf_pre\": 0 or 1,\n",
        "  \"f_liver_pre\": 0 or 1,\n",
        "  \"f_nephrotox_pre\": 0 or 1\n",
        "}}\n",
        "\n",
        "Discharge note:\n",
        "---\n",
        "{note_text}\n",
        "---\n",
        "\"\"\".strip()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "-SdBosjB95KC"
      },
      "outputs": [],
      "source": [
        "client = OpenAI(api_key=OPENAI_API_KEY)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BLlpxnLz9v2F",
        "outputId": "eb13b3d3-e88c-41a1-ef8b-99dcd0f37685"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "discharge_detail columns: ['note_id', 'subject_id', 'field_name', 'field_value', 'field_ordinal']\n",
            "discharge columns: ['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq', 'charttime', 'storetime', 'text']\n"
          ]
        }
      ],
      "source": [
        "tmp = pd.read_csv(NOTE/\"discharge_detail.csv.gz\", nrows=3)\n",
        "print(\"discharge_detail columns:\", list(tmp.columns))\n",
        "tmp2 = pd.read_csv(NOTE/\"discharge.csv.gz\", nrows=3)\n",
        "print(\"discharge columns:\", list(tmp2.columns))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "XeKD-A3GMWS9"
      },
      "outputs": [],
      "source": [
        "# ---------- Utils ----------\n",
        "def _to_datetime_safe(s: pd.Series, fmt: str | None = \"%Y-%m-%d %H:%M:%S\") -> pd.Series:\n",
        "    x = pd.to_datetime(s, errors=\"coerce\", format=fmt)\n",
        "    if x.isna().any():\n",
        "        y = pd.to_datetime(s[x.isna()], errors=\"coerce\")\n",
        "        x.loc[x.isna()] = y\n",
        "    return x\n",
        "\n",
        "def _assert_has(df: pd.DataFrame, cols: list[str], name=\"df\"):\n",
        "    miss = [c for c in cols if c not in df.columns]\n",
        "    if miss:\n",
        "        raise ValueError(f\"[{name}] missing columns: {miss}\")\n",
        "\n",
        "def _smd(x, t, w=None):\n",
        "    x = np.asarray(x, float); t = np.asarray(t, int)\n",
        "    if w is None: w = np.ones_like(t, float)\n",
        "    if (t==1).sum()==0 or (t==0).sum()==0: return np.nan\n",
        "    m1 = np.average(x[t==1], weights=w[t==1]); m0 = np.average(x[t==0], weights=w[t==0])\n",
        "    v1 = np.average((x[t==1]-m1)**2, weights=w[t==1]); v0 = np.average((x[t==0]-m0)**2, weights=w[t==0])\n",
        "    return (m1-m0)/np.sqrt((v1+v0)/2 + 1e-9)\n",
        "\n",
        "def evalue_from_hr(hr, lcl, ucl):\n",
        "    def _ev(x: float) -> float:\n",
        "        return x + math.sqrt(max(x,0)*(max(x,0)-1.0)) if x>1 else 1.0\n",
        "    return (_ev(float(hr)), _ev(float(lcl)) if float(lcl)>1 else 1.0)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Aj8AkVOtKMeL"
      },
      "outputs": [],
      "source": [
        "# =========================\n",
        "# 1) Treatment cohort (V vs VPT)\n",
        "# =========================\n",
        "def build_cohort(hosp: Path) -> pd.DataFrame:\n",
        "    use_cols = [\"subject_id\",\"hadm_id\",\"starttime\",\"stoptime\",\"drug\"]\n",
        "    it = pd.read_csv(hosp/\"prescriptions.csv.gz\",\n",
        "                     usecols=lambda c: c in use_cols,\n",
        "                     chunksize=500_000, low_memory=False)\n",
        "    v_chunks, p_chunks = [], []\n",
        "\n",
        "    for ch in it:\n",
        "        ch[\"starttime\"] = _to_datetime_safe(ch[\"starttime\"])\n",
        "        ch[\"stoptime\"]  = _to_datetime_safe(ch[\"stoptime\"])\n",
        "        dlow = ch[\"drug\"].astype(\"string\").str.lower()\n",
        "\n",
        "        v = ch[dlow.str.contains(RX_VANCO_SUB, na=False, regex=False)]\n",
        "        p = ch[dlow.str.contains(RX_PTZ,       na=False, regex=True)]\n",
        "\n",
        "        v = v.rename(columns={\"starttime\":\"v_start\",\"stoptime\":\"v_stop\"})\n",
        "        p = p.rename(columns={\"starttime\":\"p_start\",\"stoptime\":\"p_stop\"})\n",
        "        v_chunks.append(v[[\"subject_id\",\"hadm_id\",\"v_start\",\"v_stop\"]])\n",
        "        p_chunks.append(p[[\"subject_id\",\"hadm_id\",\"p_start\",\"p_stop\"]])\n",
        "\n",
        "    vanco = pd.concat(v_chunks, ignore_index=True) if v_chunks else \\\n",
        "            pd.DataFrame(columns=[\"subject_id\",\"hadm_id\",\"v_start\",\"v_stop\"])\n",
        "    ptz   = pd.concat(p_chunks, ignore_index=True) if p_chunks else \\\n",
        "            pd.DataFrame(columns=[\"subject_id\",\"hadm_id\",\"p_start\",\"p_stop\"])\n",
        "\n",
        "    v1 = (vanco.sort_values(\"v_start\")\n",
        "                .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "                .agg(index_time=(\"v_start\",\"first\")))\n",
        "    p1 = (ptz.sort_values(\"p_start\")\n",
        "              .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "              .agg(ptz_time=(\"p_start\",\"first\")))\n",
        "\n",
        "    v1[\"index_time\"] = _to_datetime_safe(v1[\"index_time\"])\n",
        "    p1[\"ptz_time\"]   = _to_datetime_safe(p1[\"ptz_time\"])\n",
        "\n",
        "    cohort = v1.merge(p1, how=\"left\", on=[\"subject_id\",\"hadm_id\"])\n",
        "    cohort = cohort.dropna(subset=[\"index_time\"]).reset_index(drop=True)\n",
        "\n",
        "    # STRICT: ptz_time >= index_time and <= index_time + 6h\n",
        "    cohort[\"vpt_flag\"] = (\n",
        "        cohort[\"ptz_time\"].notna() &\n",
        "        (cohort[\"ptz_time\"] >= cohort[\"index_time\"]) &\n",
        "        (cohort[\"ptz_time\"] <= cohort[\"index_time\"] + pd.Timedelta(hours=VPT_WINDOW_HOURS))\n",
        "    ).astype(int)\n",
        "\n",
        "    for k in (\"subject_id\",\"hadm_id\"):\n",
        "        cohort[k] = cohort[k].astype(\"Int64\")\n",
        "\n",
        "    print(f\"[cohort] N={len(cohort)}  VPT={int(cohort['vpt_flag'].sum())}\")\n",
        "    return cohort\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "etZlQidPKP0b"
      },
      "outputs": [],
      "source": [
        "# =========================\n",
        "# 2) Labs → SCr & AKI labels\n",
        "# =========================\n",
        "def load_scr_itemids(hosp: Path) -> list[int]:\n",
        "    d_lab = pd.read_csv(hosp/\"d_labitems.csv.gz\", **READ_KW)\n",
        "    # 1) strict: label contains 'creatinine' and fluid mentions serum/blood\n",
        "    mask = (\n",
        "        d_lab[\"label\"].astype(\"string\").str.contains(RX_SCR_LABEL, na=False, regex=False, case=False) &\n",
        "        d_lab[\"fluid\"].astype(\"string\").str.contains(RX_SCR_FLUID, na=False, regex=True, case=False)\n",
        "    )\n",
        "    ids = d_lab.loc[mask, \"itemid\"].dropna().astype(\"Int64\").astype(int).unique().tolist()\n",
        "\n",
        "    # 2) fallback: if none, ignore fluid filter (dataset 변형 대응)\n",
        "    if len(ids) == 0:\n",
        "        mask2 = d_lab[\"label\"].astype(\"string\").str.contains(RX_SCR_LABEL, na=False, regex=False, case=False)\n",
        "        ids = d_lab.loc[mask2, \"itemid\"].dropna().astype(\"Int64\").astype(int).unique().tolist()\n",
        "\n",
        "    print(f\"[scr ids] K={len(ids)}  sample: {ids[:3]}\")\n",
        "    return ids\n",
        "\n",
        "def load_scr_timeseries(hosp: Path, cohort: pd.DataFrame, scr_ids: list[int]) -> pd.DataFrame:\n",
        "    if len(scr_ids)==0:\n",
        "        raise RuntimeError(\"No SCr itemids found. Check d_labitems.csv.gz filters.\")\n",
        "\n",
        "    hadm_set = set(cohort[\"hadm_id\"].dropna().astype(int).unique().tolist())\n",
        "    keep = [\"subject_id\",\"hadm_id\",\"itemid\",\"charttime\",\"valuenum\",\"valueuom\"]\n",
        "\n",
        "    it = pd.read_csv(hosp/\"labevents.csv.gz\",\n",
        "                     usecols=lambda c: c in keep,\n",
        "                     chunksize=1_000_000, low_memory=False)\n",
        "    chunks=[]\n",
        "    for ch in it:\n",
        "        ch = ch[ch[\"itemid\"].isin(scr_ids)]\n",
        "        ch = ch[ch[\"hadm_id\"].isin(hadm_set)]\n",
        "        if len(ch)==0:\n",
        "            continue\n",
        "        ch[\"charttime\"] = _to_datetime_safe(ch[\"charttime\"])\n",
        "\n",
        "        u = ch[\"valueuom\"].astype(\"string\").str.lower()\n",
        "        ch[\"scr_mgdl\"] = ch[\"valuenum\"]\n",
        "\n",
        "        m_mgl  = u.str.contains(RX_MGL,  na=False, regex=True)\n",
        "        m_mgdl = u.str.contains(RX_MGDL, na=False, regex=True)\n",
        "\n",
        "        # mg/L → mg/dL\n",
        "        ch.loc[m_mgl, \"scr_mgdl\"] = ch.loc[m_mgl, \"valuenum\"] / 10.0\n",
        "\n",
        "        # 허용 단위만 유지\n",
        "        ch = ch[m_mgdl | m_mgl]\n",
        "        if len(ch):\n",
        "            chunks.append(ch[[\"subject_id\",\"hadm_id\",\"itemid\",\"charttime\",\"scr_mgdl\"]])\n",
        "\n",
        "    df = pd.concat(chunks, ignore_index=True) if chunks else \\\n",
        "         pd.DataFrame(columns=[\"subject_id\",\"hadm_id\",\"itemid\",\"charttime\",\"scr_mgdl\"])\n",
        "\n",
        "    df = df.merge(cohort[[\"subject_id\",\"hadm_id\",\"index_time\",\"vpt_flag\"]],\n",
        "                  on=[\"subject_id\",\"hadm_id\"], how=\"inner\")\n",
        "\n",
        "    if len(df)==0:\n",
        "        print(\"[scr ts] rows=0 (no overlapping SCr rows for cohort)\")\n",
        "        return df\n",
        "\n",
        "    df[\"index_time\"] = _to_datetime_safe(df[\"index_time\"])\n",
        "    df = df.dropna(subset=[\"charttime\",\"index_time\"]).reset_index(drop=True)\n",
        "    df[\"dt\"] = (df[\"charttime\"] - df[\"index_time\"]).dt.total_seconds()/3600.0\n",
        "    df = df.sort_values([\"subject_id\",\"hadm_id\",\"charttime\"]).reset_index(drop=True)\n",
        "    for k in (\"subject_id\",\"hadm_id\"):\n",
        "        df[k] = df[k].astype(\"Int64\")\n",
        "    print(f\"[scr ts] rows={len(df)}\")\n",
        "    return df\n",
        "\n",
        "def label_aki(df: pd.DataFrame, cohort: pd.DataFrame) -> pd.DataFrame:\n",
        "    if len(df)==0:\n",
        "        out = cohort.copy()\n",
        "        out[\"baseline\"] = np.nan\n",
        "        out[\"aki48\"] = 0; out[\"aki7x\"] = 0; out[\"aki\"] = 0\n",
        "        print(f\"[aki] rate=0.000  N={len(out)} (no SCr rows)\")\n",
        "        return out\n",
        "\n",
        "    base = (df[(df[\"dt\"]>=-24) & (df[\"dt\"]<=0)]\n",
        "              .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "              .agg(baseline=(\"scr_mgdl\",\"median\")))\n",
        "    out = cohort.merge(base, on=[\"subject_id\",\"hadm_id\"], how=\"left\")\n",
        "\n",
        "    if out[\"baseline\"].isna().any():\n",
        "        first24 = (df[(df[\"dt\"]>=0) & (df[\"dt\"]<=24)]\n",
        "                     .sort_values(\"charttime\")\n",
        "                     .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "                     .agg(first24=(\"scr_mgdl\",\"first\")))\n",
        "        out = out.merge(first24, on=[\"subject_id\",\"hadm_id\"], how=\"left\")\n",
        "        out[\"baseline\"] = out[\"baseline\"].fillna(out[\"first24\"])\n",
        "\n",
        "    p48 = (df[(df[\"dt\"]>0) & (df[\"dt\"]<=48)]\n",
        "             .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "             .agg(max48=(\"scr_mgdl\",\"max\")))\n",
        "    p7d = (df[(df[\"dt\"]>0) & (df[\"dt\"]<=24*7)]\n",
        "             .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "             .agg(max7=(\"scr_mgdl\",\"max\")))\n",
        "\n",
        "    out = out.merge(p48, on=[\"subject_id\",\"hadm_id\"], how=\"left\") \\\n",
        "             .merge(p7d, on=[\"subject_id\",\"hadm_id\"], how=\"left\")\n",
        "\n",
        "    out[\"aki48\"] = (out[\"max48\"] >= (out[\"baseline\"] + 0.3)).astype(int)\n",
        "    out[\"aki7x\"] = (out[\"max7\"]  >= (1.5 * out[\"baseline\"])).astype(int)\n",
        "    out[\"aki\"]   = ((out[\"aki48\"]==1) | (out[\"aki7x\"]==1)).astype(int)\n",
        "\n",
        "    print(f\"[aki] rate={out['aki'].mean():.3f}  N={len(out)}\")\n",
        "    return out"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "R_Xus9OnTEV-"
      },
      "outputs": [],
      "source": [
        "# =========================\n",
        "# 3) Time-to-event window (≤7d)\n",
        "# =========================\n",
        "def build_event_times(df_scr: pd.DataFrame, out: pd.DataFrame, hosp: Path) -> pd.DataFrame:\n",
        "    adm = pd.read_csv(hosp/\"admissions.csv.gz\",\n",
        "                      usecols=[\"subject_id\",\"hadm_id\",\"admittime\",\"dischtime\"],\n",
        "                      parse_dates=[\"admittime\",\"dischtime\"], **READ_KW)\n",
        "    adm[\"admittime\"] = _to_datetime_safe(adm[\"admittime\"])\n",
        "    adm[\"dischtime\"] = _to_datetime_safe(adm[\"dischtime\"])\n",
        "\n",
        "    if len(df_scr)==0:\n",
        "        evt = out[[\"subject_id\",\"hadm_id\",\"index_time\",\"vpt_flag\"]].copy()\n",
        "        evt[\"event_time_48\"] = pd.NaT\n",
        "        evt[\"event_time_7\"]  = pd.NaT\n",
        "        evt[\"event_time\"]    = pd.NaT\n",
        "    else:\n",
        "        tmp48 = (df_scr[(df_scr[\"dt\"]>0) & (df_scr[\"dt\"]<=48)]\n",
        "                 .merge(out[[\"subject_id\",\"hadm_id\",\"baseline\"]],\n",
        "                        on=[\"subject_id\",\"hadm_id\"], how=\"left\"))\n",
        "        hit48 = tmp48[tmp48[\"scr_mgdl\"] >= (tmp48[\"baseline\"] + 0.3)]\n",
        "        ev48 = (hit48.sort_values(\"charttime\")\n",
        "                     .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "                     .first()[[\"subject_id\",\"hadm_id\",\"charttime\"]]\n",
        "                     .rename(columns={\"charttime\":\"event_time_48\"}))\n",
        "\n",
        "        tmp7 = (df_scr[(df_scr[\"dt\"]>0) & (df_scr[\"dt\"]<=24*7)]\n",
        "                .merge(out[[\"subject_id\",\"hadm_id\",\"baseline\"]],\n",
        "                       on=[\"subject_id\",\"hadm_id\"], how=\"left\"))\n",
        "        hit7 = tmp7[tmp7[\"scr_mgdl\"] >= (1.5 * tmp7[\"baseline\"])]\n",
        "        ev7 = (hit7.sort_values(\"charttime\")\n",
        "                   .groupby([\"subject_id\",\"hadm_id\"], as_index=False)\n",
        "                   .first()[[\"subject_id\",\"hadm_id\",\"charttime\"]]\n",
        "                   .rename(columns={\"charttime\":\"event_time_7\"}))\n",
        "\n",
        "        evt = (out[[\"subject_id\",\"hadm_id\",\"index_time\",\"vpt_flag\"]]\n",
        "               .merge(ev48, how=\"left\")\n",
        "               .merge(ev7,  how=\"left\"))\n",
        "        evt[\"event_time\"] = evt[[\"event_time_48\",\"event_time_7\"]].min(axis=1)\n",
        "\n",
        "    evt = evt.merge(adm, on=[\"subject_id\",\"hadm_id\"], how=\"left\")\n",
        "    evt[\"censor_limit\"] = evt[\"index_time\"] + pd.Timedelta(days=7)\n",
        "    evt[\"censor_time\"]  = evt[[\"dischtime\",\"censor_limit\"]].min(axis=1)\n",
        "\n",
        "    evt = evt.dropna(subset=[\"index_time\",\"censor_time\"]).reset_index(drop=True)\n",
        "    evt[\"event_observed\"] = (evt[\"event_time\"].notna()) & (evt[\"event_time\"] <= evt[\"censor_time\"])\n",
        "    evt[\"duration_days\"] = (\n",
        "        np.where(\n",
        "            evt[\"event_observed\"],\n",
        "            (evt[\"event_time\"] - evt[\"index_time\"]).dt.total_seconds(),\n",
        "            (evt[\"censor_time\"] - evt[\"index_time\"]).dt.total_seconds()\n",
        "        ) / 86400.0\n",
        "    ).clip(min=0)\n",
        "\n",
        "    for k in (\"subject_id\",\"hadm_id\"):\n",
        "        evt[k] = evt[k].astype(\"Int64\")\n",
        "\n",
        "    print(f\"[tte] rows={len(evt)} events={int(evt['event_observed'].sum())}\")\n",
        "    return evt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_W6V0PVaJ21K"
      },
      "outputs": [],
      "source": [
        "# =========================\n",
        "# 4) Notes → LLM hook (or regex fallback)\n",
        "# =========================\n",
        "def read_discharge_safe(note_dir: Path) -> pd.DataFrame:\n",
        "    raw = pd.read_csv(note_dir/\"discharge.csv.gz\", **READ_KW)\n",
        "    # 컬럼명이 다른 경우를 고려하여 안전 인식\n",
        "    # 기본 세트\n",
        "    cand_sid  = [c for c in raw.columns if c.lower()==\"subject_id\"]\n",
        "    cand_hadm = [c for c in raw.columns if c.lower()==\"hadm_id\"]\n",
        "    cand_text = [c for c in raw.columns if c.lower() in (\"text\",\"note_text\")]\n",
        "    if not (cand_sid and cand_hadm and cand_text):\n",
        "        raise ValueError(f\"[discharge.csv] required columns not found. Have={list(raw.columns)}\")\n",
        "\n",
        "    df = raw[[cand_sid[0], cand_hadm[0], cand_text[0]]].copy()\n",
        "    df.columns = [\"subject_id\",\"hadm_id\",\"text\"]\n",
        "    df[\"text\"] = df[\"text\"].astype(\"string[python]\").fillna(\"\")\n",
        "    for k in (\"subject_id\",\"hadm_id\"): df[k] = df[k].astype(\"Int64\")\n",
        "    return df\n",
        "\n",
        "def _extract_excerpt(text: str, max_chars=MAX_NOTE_CHARS) -> str:\n",
        "    return (text[:max_chars] if isinstance(text,str) else \"\")\n",
        "\n",
        "def _openai_json_call(prompt: str) -> dict:\n",
        "    if OpenAI is None or not OPENAI_API_KEY:\n",
        "        raise RuntimeError(\"OpenAI not available (package or API key).\")\n",
        "    client = OpenAI(api_key=OPENAI_API_KEY)\n",
        "    resp = client.chat.completions.create(\n",
        "        model=LLM_MODEL,\n",
        "        response_format={\"type\":\"json_object\"},\n",
        "        messages=[\n",
        "            {\"role\":\"system\",\"content\":\"Return strict JSON only.\"},\n",
        "            {\"role\":\"user\",\"content\":prompt}\n",
        "        ],\n",
        "        temperature=LLM_TEMPERATURE,\n",
        "    )\n",
        "    return json.loads(resp.choices[0].message.content)\n",
        "\n",
        "def make_llm_extractor_openai(index_time_map: dict, cache_dir: Path|None=None):\n",
        "    if cache_dir: cache_dir.mkdir(parents=True, exist_ok=True)\n",
        "\n",
        "    def llm_fn(df_notes: pd.DataFrame) -> pd.DataFrame:\n",
        "        rows=[]\n",
        "        for (sid, hid), g in df_notes.groupby([\"subject_id\",\"hadm_id\"]):\n",
        "            text = \"\\n\".join(g[\"text\"].astype(str).tolist())\n",
        "            excerpt = _extract_excerpt(text)\n",
        "            # index_time (UTC ISO) — 없는 경우 보수적으로 UNKNOWN 사용 (prompt 보수적 규칙이 0을 유도)\n",
        "            it = index_time_map.get((int(sid), int(hid)))\n",
        "            it_iso = pd.to_datetime(it).tz_localize(\"UTC\").isoformat() if pd.notna(it) else \"UNKNOWN\"\n",
        "            prompt = LLM_PROMPT_TEMPLATE.format(index_time_iso=it_iso, note_text=excerpt)\n",
        "            try:\n",
        "                raw = _openai_json_call(prompt)\n",
        "            except Exception as e:\n",
        "                print(f\"[warn] LLM fail hadm={hid}: {e} -> zero-fill\")\n",
        "                raw = {k:0 for k in CONFOUNDERS}\n",
        "            row = {\"subject_id\":int(sid), \"hadm_id\":int(hid)}\n",
        "            row.update({k:int(raw.get(k,0)) for k in CONFOUNDERS})\n",
        "            rows.append(row)\n",
        "        return pd.DataFrame(rows)\n",
        "\n",
        "    return llm_fn\n",
        "\n",
        "# Regex fallback (chronic only) → collider 회피\n",
        "TEXT_PATTERNS_CHRONIC = {\n",
        "    \"f_ckd_pre\":       r\"\\b(?:ckd|chronic kidney disease|chronic renal (?:failure|insufficiency)|esrd|end[- ]?stage renal disease|hemodialysis|peritoneal dialysis)\\b\",\n",
        "    \"f_dm_pre\":        r\"\\b(?:diabetes|dm|diabetic)\\b\",\n",
        "    \"f_hf_pre\":        r\"\\b(?:heart failure|chf|hfref|hfp ef|hfpef|hf p?ef)\\b\",\n",
        "    \"f_liver_pre\":     r\"\\b(?:cirrhosis|end[- ]?stage liver disease|esld|hepatic failure)\\b\",\n",
        "    \"f_nephrotox_pre\": r\"\\b(?:nsaid|ibuprofen|naproxen|ketorolac|ace inhibitor|acei|arb|gentamicin|amikacin|tobramycin|cyclosporine|tacrolimus)\\b\",\n",
        "}\n",
        "def regex_features_from_text(df_notes: pd.DataFrame) -> pd.DataFrame:\n",
        "    out = df_notes[[\"subject_id\",\"hadm_id\"]].drop_duplicates().copy()\n",
        "    for name, pat in TEXT_PATTERNS_CHRONIC.items():\n",
        "        rx = re.compile(pat, re.I)\n",
        "        hit = df_notes.groupby([\"subject_id\",\"hadm_id\"])[\"text\"] \\\n",
        "                      .apply(lambda s: s.str.contains(rx, na=False).any()) \\\n",
        "                      .astype(int).reset_index(name=name)\n",
        "        out = out.merge(hit, on=[\"subject_id\",\"hadm_id\"], how=\"left\")\n",
        "    for c in out.columns:\n",
        "        if c not in (\"subject_id\",\"hadm_id\"):\n",
        "            out[c] = out[c].fillna(0).astype(int)\n",
        "    return out"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gsYilxRP8Yn2"
      },
      "outputs": [],
      "source": [
        "# =========================\n",
        "# 5) Covariates + PS + Survival models\n",
        "# =========================\n",
        "def build_covariates(hosp: Path, out: pd.DataFrame, note_feat: pd.DataFrame) -> pd.DataFrame:\n",
        "    adm = pd.read_csv(hosp/\"admissions.csv.gz\",\n",
        "                      usecols=[\"subject_id\",\"hadm_id\",\"admittime\",\"dischtime\",\"admission_type\"],\n",
        "                      parse_dates=[\"admittime\",\"dischtime\"], **READ_KW)\n",
        "    pat = pd.read_csv(hosp/\"patients.csv.gz\",\n",
        "                      usecols=[\"subject_id\",\"gender\",\"anchor_age\"], **READ_KW)\n",
        "    adm[\"admittime\"] = _to_datetime_safe(adm[\"admittime\"])\n",
        "    adm[\"dischtime\"] = _to_datetime_safe(adm[\"dischtime\"])\n",
        "\n",
        "    dfc = (out.merge(adm, on=[\"subject_id\",\"hadm_id\"], how=\"left\")\n",
        "              .merge(pat, on=\"subject_id\", how=\"left\")\n",
        "              .merge(note_feat, on=[\"subject_id\",\"hadm_id\"], how=\"left\"))\n",
        "    dfc[\"age\"]      = pd.to_numeric(dfc[\"anchor_age\"], errors=\"coerce\")\n",
        "    dfc[\"sexM\"]     = (dfc[\"gender\"] == \"M\").astype(int)\n",
        "    dfc[\"is_emerg\"] = dfc[\"admission_type\"].astype(\"string\") \\\n",
        "                        .str.contains(RX_EMERG_SUB, na=False, regex=False).astype(int)\n",
        "\n",
        "    for c in CONFOUNDERS:\n",
        "        if c in dfc.columns: dfc[c] = dfc[c].fillna(0).astype(int)\n",
        "    return dfc\n",
        "\n",
        "def fit_ps_and_sw(dfc: pd.DataFrame, covs: list[str],\n",
        "                  ps_clip=PS_CLIP, w_trim=W_TRIM):\n",
        "    covs = [c for c in dict.fromkeys(covs) if c in dfc.columns]\n",
        "    if not covs:\n",
        "        raise ValueError(\"No covariates provided to fit_ps_and_sw.\")\n",
        "    X = dfc[covs].copy()\n",
        "    for c in covs: X[c] = pd.to_numeric(X[c], errors=\"coerce\")\n",
        "    X = X.fillna(X.median(numeric_only=True))\n",
        "\n",
        "    T = dfc[\"vpt_flag\"].astype(int)\n",
        "    ps_model = LogisticRegression(max_iter=400, solver=\"lbfgs\", random_state=RANDOM_STATE)\n",
        "    ps_model.fit(X, T)\n",
        "    ps = np.clip(ps_model.predict_proba(X)[:,1], *ps_clip)\n",
        "\n",
        "    p_t = float(T.mean())\n",
        "    sw = np.where(T==1, p_t/ps, (1-p_t)/(1-ps))\n",
        "    lo, hi = np.quantile(sw, w_trim); sw = np.clip(sw, lo, hi)\n",
        "\n",
        "    ESS = float((sw.sum()**2)/(sw**2).sum())\n",
        "    return X, ps, sw, ESS\n",
        "\n",
        "def cox_results_ipw_and_dr(dfc: pd.DataFrame, sw: np.ndarray, covs: list[str], penalizer=COX_PENALIZER):\n",
        "    \"\"\"Fit two weighted Cox models for the effect of ``vpt_flag`` on time-to-event.\n",
        "\n",
        "    The first model uses the treatment indicator only; the second adds ``covs`` as\n",
        "    adjustment covariates (reported under the DR_* names by the caller). Both are\n",
        "    weighted by ``sw`` and fitted with robust standard errors.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    dfc : DataFrame with ``duration_days``, ``event_observed``, ``vpt_flag`` and ``covs``.\n",
        "    sw : per-row weights, positionally aligned with the rows of ``dfc``.\n",
        "    covs : covariate column names for the adjusted model (duplicates are ignored).\n",
        "    penalizer : penalizer passed to ``CoxPHFitter``.\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    ((HR, lower, upper), (HR, lower, upper)) for the weighted-only and the\n",
        "    covariate-adjusted model respectively, on the hazard-ratio scale.\n",
        "\n",
        "    Raises\n",
        "    ------\n",
        "    RuntimeError : if lifelines is unavailable or no events remain after filtering.\n",
        "    \"\"\"\n",
        "    if CoxPHFitter is None:\n",
        "        raise RuntimeError(\"lifelines is not installed.\")\n",
        "    covs = list(dict.fromkeys(covs))  # drop duplicate names, keep order\n",
        "\n",
        "    core_cols = [\"time\",\"event\",\"treat\",\"sw\"]\n",
        "    d_all = pd.DataFrame({\n",
        "        \"time\":  pd.to_numeric(dfc[\"duration_days\"], errors=\"coerce\").to_numpy(),\n",
        "        \"event\": dfc[\"event_observed\"].astype(int).to_numpy(),\n",
        "        \"treat\": dfc[\"vpt_flag\"].astype(int).to_numpy(),\n",
        "        \"sw\":    np.asarray(sw, dtype=float)\n",
        "    })\n",
        "    # One positional mask of complete rows, reused for the covariates below so the\n",
        "    # adjusted model never pairs a subject with another subject's covariates.\n",
        "    keep = d_all[core_cols].notna().all(axis=1).to_numpy()\n",
        "    d = d_all.loc[keep].reset_index(drop=True)\n",
        "\n",
        "    if d[\"event\"].sum() == 0:\n",
        "        raise RuntimeError(\"No events in TTE window; cannot fit Cox model.\")\n",
        "\n",
        "    cph_w = CoxPHFitter(penalizer=penalizer)\n",
        "    cph_w.fit(d, duration_col=\"time\", event_col=\"event\", weights_col=\"sw\", robust=True)\n",
        "    iptw_HR  = float(np.exp(cph_w.params_[\"treat\"]))\n",
        "    iptw_LCL, iptw_UCL = np.exp(cph_w.confidence_intervals_.loc[\"treat\"].values)\n",
        "\n",
        "    X = dfc[covs].copy()\n",
        "    for c in covs: X[c] = pd.to_numeric(X[c], errors=\"coerce\")\n",
        "    # Median-impute; an entirely missing column has a NaN median, so fall back to 0.\n",
        "    X = X.fillna(X.median(numeric_only=True)).fillna(0.0)\n",
        "    # Apply the same row filter as `d` before the positional concat.\n",
        "    X = X.loc[keep].reset_index(drop=True)\n",
        "    d2 = pd.concat([d, X], axis=1)\n",
        "\n",
        "    cph_dr = CoxPHFitter(penalizer=penalizer)\n",
        "    cph_dr.fit(d2, duration_col=\"time\", event_col=\"event\", weights_col=\"sw\", robust=True)\n",
        "    dr_HR  = float(np.exp(cph_dr.params_[\"treat\"]))\n",
        "    dr_LCL, dr_UCL = np.exp(cph_dr.confidence_intervals_.loc[\"treat\"].values)\n",
        "\n",
        "    return (iptw_HR, iptw_LCL, iptw_UCL), (dr_HR, dr_LCL, dr_UCL)\n",
        "\n",
        "def evaluate_covset(dfc: pd.DataFrame, covs: list[str], label=\"BASE\"):\n",
        "    \"\"\"Summarize one covariate set: balance, PS overlap, weighted Cox results, E-value.\n",
        "\n",
        "    Covariates are de-duplicated and restricted to columns present in ``dfc``;\n",
        "    a ValueError is raised when nothing is left. The returned dict carries the\n",
        "    label, the number of covariates, mean absolute SMD before/after weighting,\n",
        "    the effective sample size, the KS statistic between treated and control\n",
        "    propensity scores, both hazard-ratio estimates with their limits and widths,\n",
        "    and the values returned by ``evalue_from_hr`` for the IPTW estimate.\n",
        "    \"\"\"\n",
        "    covs = [name for name in dict.fromkeys(covs) if name in dfc.columns]\n",
        "    if len(covs) == 0:\n",
        "        raise ValueError(\"No covariates for evaluate_covset.\")\n",
        "\n",
        "    X, ps, sw, ESS = fit_ps_and_sw(dfc, covs)\n",
        "\n",
        "    treat = dfc[\"vpt_flag\"].astype(int).values\n",
        "    is_treated = (treat == 1)\n",
        "    is_control = (treat == 0)\n",
        "\n",
        "    # Absolute standardized mean differences, unweighted and weighted.\n",
        "    raw_smd, weighted_smd = [], []\n",
        "    for col in X.columns:\n",
        "        raw_smd.append(abs(_smd(X[col].values, treat, None)))\n",
        "    for col in X.columns:\n",
        "        weighted_smd.append(abs(_smd(X[col].values, treat, sw)))\n",
        "    mean_before = float(np.nanmean(raw_smd))\n",
        "    mean_after = float(np.nanmean(weighted_smd))\n",
        "\n",
        "    # Propensity-score overlap: scipy's two-sample KS, else an ECDF gap on a fixed grid.\n",
        "    try:\n",
        "        from scipy.stats import ks_2samp\n",
        "        KS = float(ks_2samp(ps[is_treated], ps[is_control]).statistic)\n",
        "    except Exception:\n",
        "        grid = np.linspace(0,1,200)\n",
        "        ecdf_treated = np.searchsorted(np.sort(ps[is_treated]), grid, side=\"right\")/max(1, is_treated.sum())\n",
        "        ecdf_control = np.searchsorted(np.sort(ps[is_control]), grid, side=\"right\")/max(1, is_control.sum())\n",
        "        KS = float(np.max(np.abs(ecdf_treated - ecdf_control)))\n",
        "\n",
        "    iptw_est, dr_est = cox_results_ipw_and_dr(dfc, sw, covs)\n",
        "    hr_i, l_i, u_i = iptw_est\n",
        "    hr_d, l_d, u_d = dr_est\n",
        "    E_point, E_CI = evalue_from_hr(hr_i, l_i, u_i)\n",
        "\n",
        "    result = {\"covset\": label, \"k_covs\": len(covs)}\n",
        "    result[\"mean_abs_SMD_before\"] = mean_before\n",
        "    result[\"mean_abs_SMD_after\"] = mean_after\n",
        "    result[\"ESS\"] = ESS\n",
        "    result[\"KS_PS\"] = KS\n",
        "    result.update({\"IPTW_HR\": hr_i, \"IPTW_LCL\": l_i, \"IPTW_UCL\": u_i, \"IPTW_CI_width\": u_i-l_i})\n",
        "    result.update({\"DR_HR\": hr_d, \"DR_LCL\": l_d, \"DR_UCL\": u_d, \"DR_CI_width\": u_d-l_d})\n",
        "    result[\"Evalue_point\"] = E_point\n",
        "    result[\"Evalue_CI\"] = E_CI\n",
        "    return result"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zrPzUPKIMsxd"
      },
      "outputs": [],
      "source": [
        "# =========================\n",
        "# 6) Main end-to-end\n",
        "# =========================\n",
        "def run_pipeline_with_llm(HOSP: Path, NOTE: Path, use_llm: bool=True):\n",
        "    \"\"\"Run the whole analysis and compare the BASE and BASE+LLM covariate sets.\n",
        "\n",
        "    Steps: build the cohort, load creatinine series and label AKI, derive event\n",
        "    times, read discharge notes, extract note features (LLM extractor when\n",
        "    ``use_llm`` is true, regex features otherwise or if the LLM path raises),\n",
        "    assemble covariates, then evaluate both covariate sets. The comparison table\n",
        "    is written to ``OUTD/perf_base_vs_llm.csv`` and printed.\n",
        "\n",
        "    Returns the analytic DataFrame and the two-row performance DataFrame.\n",
        "    \"\"\"\n",
        "    keys = [\"subject_id\",\"hadm_id\"]\n",
        "\n",
        "    # Cohort, creatinine labs, AKI labels, time-to-event table\n",
        "    cohort = build_cohort(HOSP)\n",
        "    scr_ids = load_scr_itemids(HOSP)\n",
        "    df_scr = load_scr_timeseries(HOSP, cohort, scr_ids)\n",
        "    out = label_aki(df_scr, cohort)\n",
        "    evt = build_event_times(df_scr, out, HOSP)\n",
        "\n",
        "    # Discharge notes\n",
        "    notes = read_discharge_safe(NOTE)\n",
        "\n",
        "    # (subject_id, hadm_id) -> index_time lookup handed to the LLM extractor\n",
        "    idx_map = {}\n",
        "    for row in out[keys + [\"index_time\"]].dropna().itertuples(index=False):\n",
        "        idx_map[(int(row.subject_id), int(row.hadm_id))] = row.index_time\n",
        "\n",
        "    # Note features: regex when the LLM is disabled or its extractor raises\n",
        "    if not use_llm:\n",
        "        note_feat = regex_features_from_text(notes)\n",
        "    else:\n",
        "        try:\n",
        "            llm_fn = make_llm_extractor_openai(index_time_map=idx_map, cache_dir=OUTD/\"llm_cache\")\n",
        "            note_feat = llm_fn(notes)\n",
        "        except Exception as e:\n",
        "            print(f\"[warn] LLM extractor failed: {e} -> regex(chronic) fallback\")\n",
        "            note_feat = regex_features_from_text(notes)\n",
        "\n",
        "    # Covariates joined with the time-to-event columns; keep complete rows only\n",
        "    dfc = build_covariates(HOSP, out, note_feat)\n",
        "    tte_cols = evt[keys + [\"duration_days\",\"event_observed\"]]\n",
        "    dfc = dfc.merge(tte_cols, on=keys, how=\"left\")\n",
        "    required = [\"vpt_flag\",\"aki\",\"duration_days\",\"event_observed\"]\n",
        "    dfc = dfc.dropna(subset=required).reset_index(drop=True)\n",
        "\n",
        "    print(f\"[analytic] N={len(dfc)}  events={int(dfc['event_observed'].sum())}  treated={int(dfc['vpt_flag'].sum())}\")\n",
        "\n",
        "    # Evaluate BASE, then BASE plus the note-derived confounders that exist in dfc\n",
        "    base_covs = [\"age\",\"sexM\",\"is_emerg\",\"baseline\"]\n",
        "    llm_covs = base_covs + [c for c in CONFOUNDERS if c in dfc.columns]\n",
        "    runs = [(\"BASE\", base_covs), (\"BASE+LLM\", llm_covs)]\n",
        "    perf = pd.DataFrame([evaluate_covset(dfc, cov_list, tag) for tag, cov_list in runs])\n",
        "    perf.to_csv(OUTD/\"perf_base_vs_llm.csv\", index=False)\n",
        "\n",
        "    print(\"\\n=== Performance (BASE vs BASE+LLM) ===\")\n",
        "    cols_show = [\"covset\",\"k_covs\"]\n",
        "    cols_show += [\"mean_abs_SMD_before\",\"mean_abs_SMD_after\"]\n",
        "    cols_show += [\"ESS\",\"KS_PS\"]\n",
        "    cols_show += [\"IPTW_HR\",\"IPTW_LCL\",\"IPTW_UCL\"]\n",
        "    cols_show += [\"DR_HR\",\"DR_LCL\",\"DR_UCL\"]\n",
        "    cols_show += [\"Evalue_point\",\"Evalue_CI\"]\n",
        "    print(perf[cols_show])\n",
        "    print(f\"[saved] {OUTD/'perf_base_vs_llm.csv'}\")\n",
        "\n",
        "    return dfc, perf"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# =========================\n",
        "# RUN\n",
        "# =========================\n",
        "if __name__ == \"__main__\":\n",
        "    use_llm = True  # set to False to skip the LLM extractor and use the regex features\n",
        "    try:\n",
        "        dfc, perf = run_pipeline_with_llm(HOSP, NOTE, use_llm=use_llm)\n",
        "    except Exception as e:\n",
        "        # Keep the cell from raising, but report the exception type and traceback:\n",
        "        # the message alone does not show which pipeline stage failed.\n",
        "        import traceback\n",
        "        print(f\"[FATAL] {type(e).__name__}: {e}\")\n",
        "        traceback.print_exc()"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 454
        },
        "id": "xdJaYD7TSOLy",
        "outputId": "9d1fc180-2df3-41bd-ad0d-29a1ed005033"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "[cohort] N=90327  VPT=7822\n",
            "[scr ids] K=3  sample: [50912, 52024, 52546]\n",
            "[scr ts] rows=1172668\n",
            "[aki] rate=0.175  N=90327\n",
            "[tte] rows=90327 events=15811\n"
          ]
        },
        {
          "output_type": "error",
          "ename": "KeyboardInterrupt",
          "evalue": "",
          "traceback": [
            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
            "\u001b[0;32m/tmp/ipython-input-1112224897.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0muse_llm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m  \u001b[0;31m# LLM 사용을 끄려면 False\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m     \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m         \u001b[0mdfc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mperf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mrun_pipeline_with_llm\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mHOSP\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNOTE\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_llm\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_llm\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m     \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"[FATAL]\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/tmp/ipython-input-1812269909.py\u001b[0m in \u001b[0;36mrun_pipeline_with_llm\u001b[0;34m(HOSP, NOTE, use_llm)\u001b[0m\n\u001b[1;32m     25\u001b[0m         \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m             \u001b[0mllm_fn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_llm_extractor_openai\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_time_map\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0midx_map\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mOUTD\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;34m\"llm_cache\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m             \u001b[0mnote_feat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mllm_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnotes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     28\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     29\u001b[0m             \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"[warn] LLM extractor failed: {e} -> regex(chronic) fallback\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/tmp/ipython-input-2962363145.py\u001b[0m in \u001b[0;36mllm_fn\u001b[0;34m(df_notes)\u001b[0m\n\u001b[1;32m     49\u001b[0m             \u001b[0mprompt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mLLM_PROMPT_TEMPLATE\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_time_iso\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mit_iso\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnote_text\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mexcerpt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     50\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 51\u001b[0;31m                 \u001b[0mraw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_openai_json_call\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprompt\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     52\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     53\u001b[0m                 \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"[warn] LLM fail hadm={hid}: {e} -> zero-fill\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/tmp/ipython-input-2962363145.py\u001b[0m in \u001b[0;36m_openai_json_call\u001b[0;34m(prompt)\u001b[0m\n\u001b[1;32m     25\u001b[0m         \u001b[0;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"OpenAI not available (package or API key).\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     26\u001b[0m     \u001b[0mclient\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOpenAI\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mapi_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mOPENAI_API_KEY\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 27\u001b[0;31m     resp = client.chat.completions.create(\n\u001b[0m\u001b[1;32m     28\u001b[0m         \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mLLM_MODEL\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     29\u001b[0m         \u001b[0mresponse_format\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m\"type\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"json_object\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/openai/_utils/_utils.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    285\u001b[0m                         \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"Missing required argument: {quote(missing[0])}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    286\u001b[0m                 \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 287\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    288\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    289\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m  \u001b[0;31m# type: ignore\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/openai/resources/chat/completions/completions.py\u001b[0m in \u001b[0;36mcreate\u001b[0;34m(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, prompt_cache_key, reasoning_effort, response_format, safety_identifier, seed, service_tier, stop, store, stream, stream_options, temperature, text, tool_choice, tools, top_logprobs, top_p, user, verbosity, web_search_options, extra_headers, extra_query, extra_body, timeout)\u001b[0m\n\u001b[1;32m   1151\u001b[0m     ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n\u001b[1;32m   1152\u001b[0m         \u001b[0mvalidate_response_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse_format\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1153\u001b[0;31m         return self._post(\n\u001b[0m\u001b[1;32m   1154\u001b[0m             \u001b[0;34m\"/chat/completions\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1155\u001b[0m             body=maybe_transform(\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(self, path, cast_to, body, options, files, stream, stream_cls)\u001b[0m\n\u001b[1;32m   1257\u001b[0m             \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"post\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mto_httpx_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1258\u001b[0m         )\n\u001b[0;32m-> 1259\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mResponseT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcast_to\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream_cls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream_cls\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1260\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1261\u001b[0m     def patch(\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, cast_to, options, stream, stream_cls)\u001b[0m\n\u001b[1;32m    980\u001b[0m             \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    981\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 982\u001b[0;31m                 response = self._client.send(\n\u001b[0m\u001b[1;32m    983\u001b[0m                     \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    984\u001b[0m                     \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_should_stream_response_body\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[1;32m    912\u001b[0m         \u001b[0mauth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_request_auth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mauth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    913\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 914\u001b[0;31m         response = self._send_handling_auth(\n\u001b[0m\u001b[1;32m    915\u001b[0m             \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    916\u001b[0m             \u001b[0mauth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mauth\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_handling_auth\u001b[0;34m(self, request, auth, follow_redirects, history)\u001b[0m\n\u001b[1;32m    940\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    941\u001b[0m             \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 942\u001b[0;31m                 response = self._send_handling_redirects(\n\u001b[0m\u001b[1;32m    943\u001b[0m                     \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    944\u001b[0m                     \u001b[0mfollow_redirects\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfollow_redirects\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_handling_redirects\u001b[0;34m(self, request, follow_redirects, history)\u001b[0m\n\u001b[1;32m    977\u001b[0m                 \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    978\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 979\u001b[0;31m             \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_single_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    980\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    981\u001b[0m                 \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event_hooks\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"response\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_single_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m   1012\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1013\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mrequest_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1014\u001b[0;31m             \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransport\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1015\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1016\u001b[0m         \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSyncByteStream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpx/_transports/default.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    248\u001b[0m         )\n\u001b[1;32m    249\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mmap_httpcore_exceptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 250\u001b[0;31m             \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    251\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    252\u001b[0m         \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtyping\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_sync/connection_pool.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    254\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    255\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_close_connections\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclosing\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 256\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    257\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    258\u001b[0m         \u001b[0;31m# Return the response. Note that in this case we still have to manage\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_sync/connection_pool.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    234\u001b[0m                 \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    235\u001b[0m                     \u001b[0;31m# Send the request on the assigned connection.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 236\u001b[0;31m                     response = connection.handle_request(\n\u001b[0m\u001b[1;32m    237\u001b[0m                         \u001b[0mpool_request\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    238\u001b[0m                     )\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_sync/connection.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    101\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    102\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 103\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_connection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    104\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    105\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_connect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mRequest\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mNetworkStream\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    134\u001b[0m                 \u001b[0;32mwith\u001b[0m \u001b[0mTrace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"response_closed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    135\u001b[0m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_response_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 136\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    137\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    138\u001b[0m     \u001b[0;31m# Sending the request...\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    104\u001b[0m                     \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    105\u001b[0m                     \u001b[0mtrailing_data\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 106\u001b[0;31m                 ) = self._receive_response_headers(**kwargs)\n\u001b[0m\u001b[1;32m    107\u001b[0m                 trace.return_value = (\n\u001b[1;32m    108\u001b[0m                     \u001b[0mhttp_version\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36m_receive_response_headers\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m    175\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    176\u001b[0m         \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 177\u001b[0;31m             \u001b[0mevent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_receive_event\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    178\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh11\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mResponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    179\u001b[0m                 \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36m_receive_event\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    215\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    216\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mevent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mh11\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNEED_DATA\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 217\u001b[0;31m                 data = self._network_stream.read(\n\u001b[0m\u001b[1;32m    218\u001b[0m                     \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mREAD_NUM_BYTES\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    219\u001b[0m                 )\n",
            "\u001b[0;32m/usr/local/lib/python3.12/dist-packages/httpcore/_backends/sync.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[1;32m    126\u001b[0m         \u001b[0;32mwith\u001b[0m \u001b[0mmap_exceptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc_map\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    127\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msettimeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 128\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_bytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    129\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    130\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mfloat\u001b[0m \u001b[0;34m|\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/lib/python3.12/ssl.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, buflen, flags)\u001b[0m\n\u001b[1;32m   1230\u001b[0m                     \u001b[0;34m\"non-zero flags not allowed in calls to recv() on %s\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1231\u001b[0m                     self.__class__)\n\u001b[0;32m-> 1232\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuflen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1233\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1234\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuflen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;32m/usr/lib/python3.12/ssl.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m   1103\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1104\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1105\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1106\u001b[0m         \u001b[0;32mexcept\u001b[0m \u001b[0mSSLError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1107\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mSSL_ERROR_EOF\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuppress_ragged_eofs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
            "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "background_save": true,
          "base_uri": "https://localhost:8080/"
        },
        "id": "3fQV2fbeMvLO",
        "outputId": "c58f395d-ae97-455b-9780-a35fb393b55c"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "[cohort] N=90327  VPT=7822\n",
            "[scr ids] K=3  sample: [50912, 52024, 52546]\n",
            "[scr ts] rows=1172668\n",
            "[aki] rate=0.175  N=90327\n",
            "[tte] rows=90327 events=15811\n",
            "[warn] LLM fail hadm=22595853: name 'LLM_TEMPERATURE' is not defined -> zero-fill\n",
            "[warn] LLM extractor failed: name 'CONFOUNDERS' is not defined -> regex(chronic) fallback\n",
            "[FATAL] name 'CONFOUNDERS' is not defined\n"
          ]
        }
      ],
      "source": [
        "# =========================\n",
        "# RUN\n",
        "# =========================\n",
        "# Entry point: run the pipeline once and bind its two return values to dfc / perf.\n",
        "if __name__ == \"__main__\":\n",
        "    import traceback  # local import: only used for failure reporting in this cell\n",
        "\n",
        "    use_llm = True  # set to False to disable the LLM\n",
        "    try:\n",
        "        dfc, perf = run_pipeline_with_llm(HOSP, NOTE, use_llm=use_llm)\n",
        "    except Exception as e:\n",
        "        # Keep the one-line summary, then print the full traceback: the message alone\n",
        "        # (e.g. an undefined name) does not show which line of the pipeline raised.\n",
        "        print(\"[FATAL]\", e)\n",
        "        traceback.print_exc()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "wR0ySBwpWI55"
      },
      "outputs": [],
      "source": [
        "# Re-executes every notebook matching *.ipynb in the working directory and writes the\n",
        "# executed result back over each file (--inplace); --allow-errors keeps going past cells that raise.\n",
        "# NOTE(review): if this notebook itself sits in that directory the glob includes it - confirm this is intended.\n",
        "!jupyter nbconvert --to notebook --execute --inplace --allow-errors *.ipynb\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "iSDXePjoTifN"
      },
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jSjKYOraWHdc"
      },
      "source": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "L4",
      "machine_shape": "hm",
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}