{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np \n",
    "import os \n",
    "import sys \n",
    "from tqdm import tqdm\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from scipy.stats import zscore"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read tables\n",
    "dir_path = '/Users//Documents/research/benchmarking-os/'\n",
    "out   = pd.read_csv(os.path.join(dir_path, 'whi/data/data/main_study/csv/outc_adj_bio.csv'))\n",
    "std_trt = pd.read_csv(os.path.join(dir_path, 'whi/data/data/main_study/csv/dem_ctos_bio.csv'))[['ID', 'HRTARM', 'OSFLAG']]\n",
    "hyst    = pd.read_csv(os.path.join(dir_path, 'whi/data/data/main_study/csv/f2_ctos_bio.csv'))[['ID','HYST']]\n",
    "pre_hrt  = pd.read_csv(os.path.join(dir_path, 'whi/data/data/main_study/csv/f43_ctos_bio.csv'))[['ID', 'TOTESTAT','TOTPSTAT']]\n",
    "post_hrt = pd.read_csv(os.path.join(dir_path, 'whi/data/data/main_study/csv/f48_av1_os_pub.csv'))[['ID','ELSTYR','PLSTYR','HRTCMBP']]\n",
    "unc_hf   = pd.read_csv(os.path.join(dir_path, 'whi/data/data/main_study/csv/unc_hf_bio.csv'))[['ID','CHDYRHX','CHDEVERHX','HYPERTNHX','MIHX','PVDHX','DIABHX','STROKEHX']]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# List of outcomes     \n",
    "glbl_list = ['CHD', 'BREAST', 'STROKE', 'PE', 'ENDMTRL', 'COLORECTAL', 'BKHIP', 'DEATH']    \n",
    "other_list = ['PTCA', 'DVT']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# construct os_df \n",
    "selection_flag = 'biased'\n",
    "'''\n",
    "drop_all_excluded: this drops all patients who had hysterectomy OR are on unopposed estrogen; thus, selection, S = 0 and S = 1, is based on censoring only\n",
    "drop_some_excluded: this keeps patients who had hyseterectomy OR are on unopposed estrogen but were past users of combined HRT, assigns them to be S = 0;\n",
    "censored patients are additionally S = 0\n",
    "drop_no_excluded: keeps all patients who had hysterectomy OR are on unopposed estrogen, and assigns them S = 0; censored patients are additionally S = 0\n",
    "'''\n",
    "additional_selection_processing = 'drop_some_excluded' # 'drop_some_excluded', 'drop_no_excluded', 'drop_all_excluded'\n",
    "'''\n",
    "if censored_patients_sel0 = True, then censored patients are additionally S = 0\n",
    "'''\n",
    "censored_patients_sel0 = True \n",
    "\n",
    "os_df = std_trt.drop_duplicates('ID')\n",
    "os_df = os_df[os_df['OSFLAG'] == 'Yes']\n",
    "os_df = os_df.merge(hyst, on='ID', how='left')\n",
    "os_df = os_df.merge(pre_hrt, on='ID', how='left')\n",
    "print(os_df['TOTESTAT'].value_counts())\n",
    "print(os_df['HYST'].value_counts())\n",
    "os_df = os_df.merge(post_hrt, on='ID', how='left')\n",
    "if additional_selection_processing == 'drop_some_excluded': \n",
    "    os_df = os_df.merge(unc_hf, on='ID', how='left')\n",
    "    condition_dict = {\n",
    "        'CHDEVERHX': ('!=', 1.),\n",
    "        'HYPERTNHX': ('!=', 1.),\n",
    "        'MIHX': ('!=', 1.),\n",
    "        'PVDHX': ('!=', 1.),\n",
    "        'DIABHX': ('!=', 1.),\n",
    "        'STROKEHX': ('!=', 1.)\n",
    "    }\n",
    "    condition = (os_df['HYST'] == 'Yes') & ((os_df['TOTPSTAT'] == 'Never used') | (os_df['TOTPSTAT'] == 'Current user'))\n",
    "    os_df = os_df[~condition]\n",
    "    condition2 = (os_df['TOTESTAT'] == 'Current user') & ((os_df['TOTPSTAT'] == 'Never used') | (os_df['TOTPSTAT'] == 'Current user'))\n",
    "    os_df = os_df[~condition2]\n",
    "    os_df['S'] = os_df.apply(\n",
    "        lambda row: 0 if (row['HYST'] == 'Yes' or row['TOTESTAT'] == 'Current user') else 1,\n",
    "        axis=1\n",
    "    )\n",
    "elif additional_selection_processing == 'drop_no_excluded': \n",
    "    # Selected patients\n",
    "    os_df['S'] = os_df.apply(\n",
    "        lambda row: 1 if (row['HYST'] == 'No' and row['TOTESTAT'] in ['Never used', 'Past user']) else 0,\n",
    "        axis=1\n",
    "    )\n",
    "elif additional_selection_processing == 'drop_all_excluded':\n",
    "    os_df = os_df[os_df['HYST'] == 'No']\n",
    "    os_df = os_df[os_df['TOTESTAT'].isin(['Never used', 'Past user'])]\n",
    "    os_df['S'] = 1\n",
    "\n",
    "os_df = os_df.merge(out, on='ID', how='left')\n",
    "\n",
    "# 35551 (control) + 17503 (intervention) = 53054\n",
    "\n",
    "print(os_df[os_df['TOTPSTAT'].isin(['Current user'])].shape)\n",
    "print(os_df[os_df['TOTPSTAT'].isin(['Never used', 'Past user'])].shape)\n",
    "\n",
    "if selection_flag == 'biased': \n",
    "    os_df = os_df[os_df['TOTPSTAT'].isin(['Never used', 'Past user','Current user'])]\n",
    "    os_df['HRTARM'] = os_df['TOTPSTAT'].map({'Current user': 1, 'Never used': 0, 'Past user': 0})\n",
    "elif selection_flag == 'unbiased' or selection_flag == 'manually_biased': \n",
    "    os_df = os_df[os_df['TOTPSTAT'].isin(['Never used', 'Past user','Current user'])]\n",
    "    conditions = [\n",
    "        (((os_df['ELSTYR'] == 'Yes') & (os_df['PLSTYR'] == 'Yes')) | (os_df['HRTCMBP'] == 'Yes')),\n",
    "        ((os_df['ELSTYR'] == 'No') & (os_df['PLSTYR'] == 'No')),\n",
    "        (((os_df['ELSTYR'] == 'Yes') & (os_df['PLSTYR'] == 'No')) | ((os_df['ELSTYR'] == 'No') & (os_df['PLSTYR'] == 'Yes')))\n",
    "    ]\n",
    "    choices = [1, 0, -1]\n",
    "    os_df['HRTGRP'] = np.select(conditions, choices, default=-2)\n",
    "    os_df = os_df[os_df['HRTGRP'] != -2]\n",
    "    os_df['HRTARM'] = (os_df['HRTGRP'] == 1).astype(int)\n",
    "    os_df['S'] = os_df.apply(lambda row: 0 if row['TOTPSTAT'] == 'Current user' else row['S'], axis=1)\n",
    "os_df['OS'] = 1\n",
    "\n",
    "# os_end_day = None\n",
    "os_end_day = 6*365\n",
    "os_df['END_DY'] = os_end_day if os_end_day is not None else os_df['ENDFOLLOWDY']\n",
    "# os_df['END_DY'] = os_df.apply(lambda x: x['DEATHDY'] if x['DEATHDY'] < os_end_day else os_end_day, axis=1)\n",
    "\n",
    "\n",
    "# Process outcomes (same as CT)\n",
    "for i in glbl_list + other_list:\n",
    "    os_df[i+'_E'] = ((os_df[i] == 1) & (os_df[i+'DY'] <= os_df['END_DY'])).astype(int)\n",
    "    os_df[i+'_DY'] = np.where(os_df[i+'_E'] == 1, os_df[i+'DY'], os_df['END_DY'])\n",
    "    os_df[i+'_EDY'] = np.where(os_df[i+'_E'] == 1, os_df[i+'_DY'], np.nan)\n",
    "\n",
    "# Global index\n",
    "os_df['GLBL_E'] = (os_df[[j+'_E' for j in glbl_list]].sum(axis=1) > 0).astype(int)\n",
    "os_df['GLBL_DY'] = np.where(os_df['GLBL_E'] == 1,\n",
    "                            os_df[[j+'_EDY' for j in glbl_list]].min(axis=1),\n",
    "                            os_df[[j+'_DY' for j in glbl_list]].min(axis=1))\n",
    "\n",
    "# Selection variable adjustment\n",
    "for i in glbl_list + other_list:\n",
    "    os_df['S_'+i] = os_df['S']\n",
    "    if censored_patients_sel0: \n",
    "        os_df['S_'+i] = np.where(os_df[i+'DY'] > os_df['END_DY'], 0, os_df['S_'+i])\n",
    "os_df['S_GLBL'] = os_df['S']\n",
    "\n",
    "# Select needed columns\n",
    "os_df = os_df[['ID', 'OS', 'HRTARM'] + \n",
    "                ['S_'+j for j in glbl_list + other_list + ['GLBL']] + \n",
    "                [j+'_E' for j in glbl_list + other_list + ['GLBL']] + \n",
    "                [j+'_DY' for j in glbl_list + other_list + ['GLBL']]]\n",
    "\n",
    "os_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "if selection_flag == 'manually_biased': \n",
    "    # removing age and menopausal status\n",
    "    categorical_features = {\n",
    "        'dem_ctos_bio.csv': {'ETHNIC': True, 'EDUC': True}, \n",
    "        'f80_ctos_bio.csv': {'BMI': False}, \n",
    "        'f34_ctos_bio.csv': {'SMOKING': True}, \n",
    "        'f151_ctos_bio.csv': {'PHYSFUN': False}    \n",
    "    }\n",
    "    \n",
    "    new_feature_dict = { \n",
    "        'dem_ctos_bio.csv': ['ETHNIC_White', \\\n",
    "                             'EDUC_Some post-graduate or professional', \\\n",
    "                             'EDUC_Some college or Associate Degree'],\n",
    "        'f80_ctos_bio.csv': ['BMI'],\n",
    "        'f34_ctos_bio.csv': ['SMOKING_Past Smoker', 'SMOKING_Current Smoker'],\n",
    "        'f151_ctos_bio.csv': ['PHYSFUN']\n",
    "    }\n",
    "else:\n",
    "    categorical_features = {\n",
    "        'dem_ctos_bio.csv': {'AGE': False, 'ETHNIC': True, 'EDUC': True}, \n",
    "        'f80_ctos_bio.csv': {'BMI': False}, \n",
    "        'f34_ctos_bio.csv': {'SMOKING': True}, \n",
    "        'f31_ctos_bio.csv': {'MENO': False}, \n",
    "        'f151_ctos_bio.csv': {'PHYSFUN': False}    \n",
    "    }\n",
    "    \n",
    "    new_feature_dict = { \n",
    "        'dem_ctos_bio.csv': ['AGE', 'ETHNIC_White', \\\n",
    "                             'EDUC_Some post-graduate or professional', \\\n",
    "                             'EDUC_Some college or Associate Degree'],\n",
    "        'f80_ctos_bio.csv': ['BMI'],\n",
    "        'f34_ctos_bio.csv': ['SMOKING_Past Smoker', 'SMOKING_Current Smoker'],\n",
    "        'f31_ctos_bio.csv': ['MENO'],\n",
    "        'f151_ctos_bio.csv': ['PHYSFUN']\n",
    "    }\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas.api.types as ptypes\n",
    "\n",
    "ctos_temp = os_df.copy()\n",
    "# Dictionary to specify which features are categorical\n",
    "\n",
    "# dfs = []  # Store all dataframes to concatenate later\n",
    "new_dir_path = dir_path + 'whi/data/data/main_study/csv'\n",
    "\n",
    "for filename, f_dict in categorical_features.items():\n",
    "    # Read the data\n",
    "    df = pd.read_csv(os.path.join(new_dir_path, filename))\n",
    "    if filename == 'f80_ctos_bio.csv': \n",
    "        df = df.query('F80VTYP == \"Screening\"')\n",
    "    elif filename == 'f151_ctos_bio.csv': \n",
    "        idx = df.groupby('ID')['F151DAYS'].idxmin().reset_index(drop=True)\n",
    "        df = df.loc[idx, :].reset_index(drop=True)[['ID','PHYSFUN']]\n",
    "    # Select needed columns\n",
    "    features = list(f_dict.keys())\n",
    "    df = df[['ID'] + features]\n",
    "    \n",
    "    # Separate ID column\n",
    "    id_col = df['ID']\n",
    "    print(f\"Processed {filename}\")\n",
    "    print(df.shape)\n",
    "\n",
    "    orig_cols = ctos_temp.columns.tolist()\n",
    "    ctos_temp = ctos_temp.merge(df, on='ID', how='left')\n",
    "\n",
    "    # Handle continuous and categorical features separately\n",
    "    cont_features = [f for f in features if not f_dict[f]]\n",
    "    cat_features = [f for f in features if f_dict[f]]\n",
    "    \n",
    "    # Handle continuous features\n",
    "    if cont_features:\n",
    "        cont_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')\n",
    "        ctos_temp[cont_features] = cont_imputer.fit_transform(ctos_temp[cont_features])\n",
    "    \n",
    "    # Handle categorical features\n",
    "    if cat_features:\n",
    "        cat_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')\n",
    "        ctos_temp[cat_features] = cat_imputer.fit_transform(ctos_temp[cat_features])\n",
    "        \n",
    "        # One-hot encode categorical features\n",
    "        ctos_temp = pd.get_dummies(ctos_temp, columns=cat_features, prefix=cat_features)\n",
    "\n",
    "    if filename == 'dem_ctos_bio.csv': \n",
    "        ctos_temp = ctos_temp.rename(columns={'ETHNIC_White (not of Hispanic origin)': 'ETHNIC_White'})\n",
    "\n",
    "    ctos_temp = ctos_temp[orig_cols + new_feature_dict[filename]]\n",
    "\n",
    "ctos_temp = ctos_temp.astype({col: int for col in ctos_temp.select_dtypes(include='bool').columns})\n",
    "display(ctos_temp)    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split \n",
    "\n",
    "df_os = ctos_temp.copy()\n",
    "if selection_flag == 'manually_biased': \n",
    "    predictors = ['ETHNIC_White', 'EDUC_Some post-graduate or professional', \n",
    "          'EDUC_Some college or Associate Degree', 'BMI', 'SMOKING_Past Smoker', \n",
    "          'SMOKING_Current Smoker', 'PHYSFUN'] \n",
    "else: \n",
    "    predictors = ['AGE', 'ETHNIC_White', 'EDUC_Some post-graduate or professional', \n",
    "          'EDUC_Some college or Associate Degree', 'BMI', 'SMOKING_Past Smoker', \n",
    "          'SMOKING_Current Smoker', 'MENO', 'PHYSFUN'] \n",
    "outcome_name = 'CHD' # STROKE, BREAST\n",
    "\n",
    "outcome= outcome_name + '_E'\n",
    "trt    = 'HRTARM'\n",
    "select = f'S_{outcome_name}'\n",
    "\n",
    "drop_columns = [x for x in df_os.columns if x not in predictors + [outcome, trt, 'ID', 'S']]\n",
    "df_os.rename(columns={trt: 'A', outcome: 'Y'}, inplace=True) \n",
    "df_os['S']  = df_os[select]\n",
    "df_os['Y0'] = df_os['Y']\n",
    "df_os['Y1'] = df_os['Y']\n",
    "df_os['R'] = 1 - df_os['OS']\n",
    "df_os.drop(columns=drop_columns, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "display(df_os)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_os_S1 = df_os.query('S == 1')\n",
    "print(df_os_S1.columns)\n",
    "print(df_os_S1['MENO'])\n",
    "print(df_os_S1.query('MENO == 1.').shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# $P(A|X,U)$ (A signal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def print_distribution(df, variable, condition=None, subgroup_name=\"Overall Population\"):\n",
    "    \"\"\"\n",
    "    Print the distribution of a binary variable for a specified subgroup.\n",
    "    \n",
    "    Parameters:\n",
    "    -----------\n",
    "    df : pandas.DataFrame\n",
    "        The dataframe containing the data\n",
    "    variable : str\n",
    "        The name of the binary variable to analyze ('A', 'Y', or 'S')\n",
    "    condition : pandas.Series or None\n",
    "        A boolean mask for filtering the dataframe\n",
    "        If None, returns distribution for entire population\n",
    "    subgroup_name : str\n",
    "        Name of the subgroup for display purposes\n",
    "    \"\"\"\n",
    "    if condition is None:\n",
    "        # Get distribution for entire population\n",
    "        subgroup = df\n",
    "    else:\n",
    "        # Filter using the boolean mask\n",
    "        subgroup = df[condition]\n",
    "    \n",
    "    # Calculate distribution\n",
    "    dist = subgroup[variable].value_counts(normalize=True).sort_index()\n",
    "    \n",
    "    # Get variable label for display\n",
    "    var_labels = {\n",
    "        'A': 'Treatment',\n",
    "        'Y': 'Outcome',\n",
    "        'S': 'Selection'\n",
    "    }\n",
    "    var_label = var_labels.get(variable, variable)\n",
    "    \n",
    "    # Print results\n",
    "    print(f\"\\nDistribution of {var_label} for {subgroup_name}\")\n",
    "    print(\"-\" * 80)\n",
    "    print(f\"Total number of patients: {len(subgroup)}\")\n",
    "    print(f\"{var_label} = 1: {dist.get(1, 0):.2%}\")\n",
    "    print(f\"{var_label} = 0: {dist.get(0, 0):.2%}\")\n",
    "    print(\"-\" * 80)\n",
    "\n",
    "# Example usage:\n",
    "# # For treatment (A) distribution\n",
    "# print_distribution(df_os_S1, 'A', subgroup_name=\"Overall Population\")\n",
    "# print_distribution(df_os_S1, 'A', df_os_S1['AGE'] >= 65, subgroup_name=\"Age >= 65 (older subpopulation)\")\n",
    "# print_distribution(df_os_S1, 'A', (df_os_S1['AGE'] >= 65) & (df_os_S1['BMI'] >= 30), subgroup_name=\"Age >= 65 and BMI >= 30\")\n",
    "# print_distribution(df_os_S1, 'A', df_os_S1['MENO'] < df_os_S1['AGE'], subgroup_name=\"Post-menopausal group\")\n",
    "\n",
    "# # For outcome (Y) distribution\n",
    "# print_distribution(df_os_S1, 'Y', subgroup_name=\"Overall Population\")\n",
    "# print_distribution(df_os_S1, 'Y', df_os_S1['AGE'] >= 65, subgroup_name=\"AGE >= 65\")\n",
    "# print_distribution(df_os_S1, 'Y', (df_os_S1['AGE'] >= 65) & (df_os_S1['BMI'] >= 30), subgroup_name=\"Age >= 65 and BMI >= 30\")\n",
    "# print_distribution(df_os_S1, 'Y', df_os_S1['MENO'] < df_os_S1['AGE'], subgroup_name=\"Post-menopausal group\")\n",
    "\n",
    "# # For selection (S) distribution\n",
    "# print_distribution(df_os, 'S', subgroup_name=\"Overall Population\")\n",
    "# print_distribution(df_os, 'S', df_os['AGE'] >= 65, subgroup_name=\"Age >= 65 (older subpopulation)\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Different sequence of subgroups (generated by LLM) for CHD ==> A, Y, S"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_dist = df_os_S1\n",
    "outcome = 'Y'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Increased RISK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#### AGE BASED PROGRESSION\n",
    "\n",
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# Older age\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] >= 65, subgroup_name=\"Age ≥ 65\")\n",
    "\n",
    "# Older age + higher BMI\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] >= 65) & (df_dist['BMI'] >= 30), \n",
    "                  subgroup_name=\"Age ≥ 65, BMI ≥ 30\")\n",
    "\n",
    "# Older age + higher BMI + smoking history\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] >= 65) & (df_dist['BMI'] >= 30) & \n",
    "                  (df_dist['SMOKING_Past Smoker'] == 1), \n",
    "                  subgroup_name=\"Age ≥ 65, BMI ≥ 30, Past Smoker\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Sequence 2: Physical function and age\n",
    "# Low physical function\n",
    "print_distribution(df_dist, outcome, df_dist['PHYSFUN'] < 70, \n",
    "                  subgroup_name=\"Lower Physical Function (< 70)\")\n",
    "\n",
    "# Low physical function + older age\n",
    "print_distribution(df_dist, outcome, (df_dist['PHYSFUN'] < 70) & (df_dist['AGE'] >= 70), \n",
    "                  subgroup_name=\"Lower Physical Function (< 70), Age ≥ 70\")\n",
    "\n",
    "# Low physical function + older age + current smoker\n",
    "print_distribution(df_dist, outcome, (df_dist['PHYSFUN'] < 70) & (df_dist['AGE'] >= 70) & \n",
    "                  (df_dist['SMOKING_Current Smoker'] == 1), \n",
    "                  subgroup_name=\"Lower Physical Function (< 70), Age ≥ 70, Current Smoker\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Sequence 3: Combined risk factors\n",
    "\n",
    "# Very elderly\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] >= 75, \n",
    "                  subgroup_name=\"Very Elderly (Age ≥ 75)\")\n",
    "\n",
    "# Very elderly + low physical function \n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] >= 75) & (df_dist['PHYSFUN'] < 60), \n",
    "                  subgroup_name=\"Very Elderly (Age ≥ 75), Very Low Physical Function (< 60)\")\n",
    "\n",
    "# Very elderly + low physical function + obesity\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] >= 75) & (df_dist['PHYSFUN'] < 60) & \n",
    "                  (df_dist['BMI'] >= 32), \n",
    "                  subgroup_name=\"Very Elderly (Age ≥ 75), Very Low Physical Function (< 60), Severe Obesity (BMI ≥ 32)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Decreased RISK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# Younger age (reduced risk)\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] < 60, \n",
    "                  subgroup_name=\"Age < 60\")\n",
    "\n",
    "# Younger age + excellent physical function\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & (df_dist['PHYSFUN'] >= 90), \n",
    "                  subgroup_name=\"Age < 60, Excellent Physical Function (≥ 90)\")\n",
    "\n",
    "# Younger age + excellent physical function + healthy BMI\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & (df_dist['PHYSFUN'] >= 90) & \n",
    "                  (df_dist['BMI'] < 25), \n",
    "                  subgroup_name=\"Age < 60, Excellent Physical Function (≥ 90), Healthy BMI (< 25)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Never smokers\n",
    "print_distribution(df_dist, outcome, \n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0),\n",
    "                  subgroup_name=\"Never Smokers\")\n",
    "\n",
    "# Never smokers + higher education\n",
    "print_distribution(df_dist, outcome, \n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1),\n",
    "                  subgroup_name=\"Never Smokers, Higher Education\")\n",
    "\n",
    "# Never smokers + higher education + regular physical activity (inferred by high physical function)\n",
    "print_distribution(df_dist, outcome, \n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1) &\n",
    "                  (df_dist['PHYSFUN'] >= 85),\n",
    "                  subgroup_name=\"Never Smokers, Higher Education, High Physical Function (≥ 85)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# Younger age\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] < 55, \n",
    "                  subgroup_name=\"Age < 55\")\n",
    "\n",
    "# Younger age + very high physical function\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95)\")\n",
    "\n",
    "# Younger age + very high physical function + optimal BMI range\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23)\")\n",
    "\n",
    "# Add never smoker status\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23), Never Smoker\")\n",
    "\n",
    "# Add higher education (post-graduate)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23), Never Smoker, Post-Grad Education\")\n",
    "\n",
    "# Try adding very recent menopause as another factor\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 5), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23), Never Smoker, Post-Grad Education, Recent Menopause (<5 yrs)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# 1. First filter by age (strongest predictor)\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] < 60, \n",
    "                  subgroup_name=\"Age < 60\")\n",
    "\n",
    "# 2. Add physical function\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85), \n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85)\")\n",
    "\n",
    "# 3. Add BMI\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25)\")\n",
    "\n",
    "# 4. Add smoking status (both variables)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25), Never Smoker\")\n",
    "\n",
    "# 5. Add menopausal status (later menopause)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs)\")\n",
    "\n",
    "# 6. Add higher education (first level)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs), Post-Graduate Education\")\n",
    "\n",
    "# 7. Add higher education (second level) - though this should be mutually exclusive with first level\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  ((df_dist['EDUC_Some post-graduate or professional'] == 1) | \n",
    "                   (df_dist['EDUC_Some college or Associate Degree'] == 1)),\n",
    "                  subgroup_name=\"Age < 60, High Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs), Higher Education (Any)\")\n",
    "\n",
    "# 8. Add ethnicity (note: may not further decrease risk as white ethnicity has complex relationship with CHD)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  ((df_dist['EDUC_Some post-graduate or professional'] == 1) | \n",
    "                   (df_dist['EDUC_Some college or Associate Degree'] == 1)) &\n",
    "                  (df_dist['ETHNIC_White'] == 1),\n",
    "                  subgroup_name=\"Age < 60, High Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs), Higher Education (Any), White Ethnicity\")\n",
    "\n",
    "# 9. Complete model including all variables, with combined education\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  ((df_dist['EDUC_Some post-graduate or professional'] == 1) | \n",
    "                   (df_dist['EDUC_Some college or Associate Degree'] == 1)) &\n",
    "                  (df_dist['ETHNIC_White'] == 1),\n",
    "                  subgroup_name=\"Complete Model (All Protective Factors)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Different sequence of subgroups (generated by LLM) for Breast Cancer ==> A, Y, S"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_dist = df_os\n",
    "outcome = 'S'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Increased RISK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Sequence 1: Age-based progression\n",
    "\n",
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# Middle-aged to older women\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] >= 60, \n",
    "                  subgroup_name=\"Age ≥ 60\")\n",
    "\n",
    "# Older women + White ethnicity\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] >= 60) & (df_dist['ETHNIC_White'] == 1), \n",
    "                  subgroup_name=\"Age ≥ 60, White Ethnicity\")\n",
    "\n",
    "# Older + White + higher education (proxy for higher socioeconomic status)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] >= 60) & (df_dist['ETHNIC_White'] == 1) & \n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1), \n",
    "                  subgroup_name=\"Age ≥ 60, White Ethnicity, Higher Education\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Sequence 2: Menopausal status and risk factors\n",
    "\n",
    "# Post-menopausal women (long duration since menopause)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] - df_dist['MENO']) >= 15, \n",
    "                  subgroup_name=\"≥15 Years Post-Menopausal\")\n",
    "\n",
    "# Post-menopausal + higher BMI (risk factor in post-menopausal women)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] - df_dist['MENO'] >= 15) & \n",
    "                  (df_dist['BMI'] >= 28), \n",
    "                  subgroup_name=\"≥15 Years Post-Menopausal, BMI ≥ 28\")\n",
    "\n",
    "# Post-menopausal + higher BMI + smoking history\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] - df_dist['MENO'] >= 15) & \n",
    "                  (df_dist['BMI'] >= 28) & \n",
    "                  ((df_dist['SMOKING_Past Smoker'] == 1) | (df_dist['SMOKING_Current Smoker'] == 1)), \n",
    "                  subgroup_name=\"≥15 Years Post-Menopausal, BMI ≥ 28, Ever Smoker\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Sequence 3: Early menopause and other risk factors\n",
    "\n",
    "# Early menopause (potential risk window)\n",
    "print_distribution(df_dist, outcome, (df_dist['MENO'] < 45) & (df_dist['AGE'] >= 60), \n",
    "                  subgroup_name=\"Early Menopause (< 45), Age ≥ 60\")\n",
    "\n",
    "# Early menopause + higher BMI\n",
    "print_distribution(df_dist, outcome, (df_dist['MENO'] < 45) & (df_dist['AGE'] >= 60) & \n",
    "                  (df_dist['BMI'] >= 30), \n",
    "                  subgroup_name=\"Early Menopause (< 45), Age ≥ 60, BMI ≥ 30\")\n",
    "\n",
    "# Early menopause + higher BMI + White ethnicity\n",
    "print_distribution(df_dist, outcome, (df_dist['MENO'] < 45) & (df_dist['AGE'] >= 60) & \n",
    "                  (df_dist['BMI'] >= 30) & (df_dist['ETHNIC_White'] == 1), \n",
    "                  subgroup_name=\"Early Menopause (< 45), Age ≥ 60, BMI ≥ 30, White Ethnicity\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Sequence 4: Physical function and other factors\n",
    "\n",
    "\n",
    "# Lower physical function (possible indicator of health issues)\n",
    "print_distribution(df_dist, outcome, (df_dist['PHYSFUN'] < 75) & (df_dist['AGE'] >= 65), \n",
    "                  subgroup_name=\"Lower Physical Function (< 75), Age ≥ 65\")\n",
    "\n",
    "# Lower physical function + higher BMI\n",
    "print_distribution(df_dist, outcome, (df_dist['PHYSFUN'] < 75) & (df_dist['AGE'] >= 65) & \n",
    "                  (df_dist['BMI'] >= 27), \n",
    "                  subgroup_name=\"Lower Physical Function (< 75), Age ≥ 65, BMI ≥ 27\")\n",
    "\n",
    "# Lower physical function + higher BMI + early menopause\n",
    "print_distribution(df_dist, outcome, (df_dist['PHYSFUN'] < 75) & (df_dist['AGE'] >= 65) & \n",
    "                  (df_dist['BMI'] >= 27) & (df_dist['MENO'] < 48), \n",
    "                  subgroup_name=\"Lower Physical Function (< 75), Age ≥ 65, BMI ≥ 27, Early Menopause (< 48)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Decreased RISK"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# Non-white ethnicity (generally lower breast cancer risk)\n",
    "print_distribution(df_dist, outcome, df_dist['ETHNIC_White'] == 0, \n",
    "                  subgroup_name=\"Non-White Ethnicity\")\n",
    "\n",
    "# Non-white ethnicity + healthy BMI\n",
    "print_distribution(df_dist, outcome, (df_dist['ETHNIC_White'] == 0) & \n",
    "                  (df_dist['BMI'] < 25), \n",
    "                  subgroup_name=\"Non-White Ethnicity, Healthy BMI (< 25)\")\n",
    "\n",
    "# Non-white ethnicity + healthy BMI + never smoker\n",
    "print_distribution(df_dist, outcome, (df_dist['ETHNIC_White'] == 0) & \n",
    "                  (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0),\n",
    "                  subgroup_name=\"Non-White Ethnicity, Healthy BMI (< 25), Never Smoker\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Later menopause (potential protective effect)\n",
    "print_distribution(df_dist, outcome, (df_dist['MENO'] >= 52) & (df_dist['AGE'] < 65), \n",
    "                  subgroup_name=\"Later Menopause (≥ 52), Age < 65\")\n",
    "\n",
    "# Later menopause + medium BMI (not too high, not too low)\n",
    "print_distribution(df_dist, outcome, (df_dist['MENO'] >= 52) & (df_dist['AGE'] < 65) & \n",
    "                  (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25), \n",
    "                  subgroup_name=\"Later Menopause (≥ 52), Age < 65, Moderate BMI (20-25)\")\n",
    "\n",
    "# Later menopause + medium BMI + non-white ethnicity\n",
    "print_distribution(df_dist, outcome, (df_dist['MENO'] >= 52) & (df_dist['AGE'] < 65) & \n",
    "                  (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['ETHNIC_White'] == 0), \n",
    "                  subgroup_name=\"Later Menopause (≥ 52), Age < 65, Moderate BMI (20-25), Non-White Ethnicity\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# Younger age\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] < 55, \n",
    "                  subgroup_name=\"Age < 55\")\n",
    "\n",
    "# Younger age + very high physical function\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95)\")\n",
    "\n",
    "# Younger age + very high physical function + optimal BMI range\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23)\")\n",
    "\n",
    "# Add never smoker status\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23), Never Smoker\")\n",
    "\n",
    "# Add higher education (post-graduate)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23), Never Smoker, Post-Grad Education\")\n",
    "\n",
    "# Try adding very recent menopause as another factor\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 55) & (df_dist['PHYSFUN'] >= 95) & \n",
    "                  (df_dist['BMI'] >= 21) & (df_dist['BMI'] <= 23) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 5), \n",
    "                  subgroup_name=\"Age < 55, Very High Function (≥ 95), Optimal BMI (21-23), Never Smoker, Post-Grad Education, Recent Menopause (<5 yrs)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: All selected patients\n",
    "print_distribution(df_dist, outcome, subgroup_name=\"Overall Population\")\n",
    "\n",
    "# 1. First filter by age (strongest predictor)\n",
    "print_distribution(df_dist, outcome, df_dist['AGE'] < 60, \n",
    "                  subgroup_name=\"Age < 60\")\n",
    "\n",
    "# 2. Add physical function\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85), \n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85)\")\n",
    "\n",
    "# 3. Add BMI\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25)\")\n",
    "\n",
    "# 4. Add smoking status (both variables)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25), Never Smoker\")\n",
    "\n",
    "# 5. Add menopausal status (later menopause)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs)\")\n",
    "\n",
    "# 6. Add higher education (first level)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  (df_dist['EDUC_Some post-graduate or professional'] == 1),\n",
    "                  subgroup_name=\"Age < 60, High Physical Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs), Post-Graduate Education\")\n",
    "\n",
    "# 7. Add higher education (second level) - though this should be mutually exclusive with first level\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  ((df_dist['EDUC_Some post-graduate or professional'] == 1) | \n",
    "                   (df_dist['EDUC_Some college or Associate Degree'] == 1)),\n",
    "                  subgroup_name=\"Age < 60, High Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs), Higher Education (Any)\")\n",
    "\n",
    "# 8. Add ethnicity (note: may not further decrease risk as white ethnicity has complex relationship with CHD)\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  ((df_dist['EDUC_Some post-graduate or professional'] == 1) | \n",
    "                   (df_dist['EDUC_Some college or Associate Degree'] == 1)) &\n",
    "                  (df_dist['ETHNIC_White'] == 1),\n",
    "                  subgroup_name=\"Age < 60, High Function (≥ 85), Healthy BMI (20-25), Never Smoker, Recent Menopause (<10 yrs), Higher Education (Any), White Ethnicity\")\n",
    "\n",
    "# 9. Complete model including all variables, with combined education\n",
    "print_distribution(df_dist, outcome, (df_dist['AGE'] < 60) & \n",
    "                  (df_dist['PHYSFUN'] >= 85) & (df_dist['BMI'] >= 20) & (df_dist['BMI'] < 25) &\n",
    "                  (df_dist['SMOKING_Current Smoker'] == 0) & (df_dist['SMOKING_Past Smoker'] == 0) &\n",
    "                  ((df_dist['AGE'] - df_dist['MENO']) < 10) &\n",
    "                  ((df_dist['EDUC_Some post-graduate or professional'] == 1) | \n",
    "                   (df_dist['EDUC_Some college or Associate Degree'] == 1)) &\n",
    "                  (df_dist['ETHNIC_White'] == 1),\n",
    "                  subgroup_name=\"Complete Model (All Protective Factors)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "benchmarking-os-env-kernel",
   "language": "python",
   "name": "benchmarking-os-env-kernel"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
