{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "624fa158",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1de6be73",
   "metadata": {},
   "outputs": [],
   "source": [
    "nanda_df = pd.read_stata(\"../analytic/aux_data/nanda_healthcare_tract_2003-2017_03P.dta\") # This file must be downloaded from https://www.openicpsr.org/openicpsr/project/120907/version/V3/view\n",
    "nanda_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "287c0718",
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_other_df = pd.read_csv(\"../analytic/aux_data/pos_other_Q42018.csv\", low_memory=False) # This file must be downloaded from https://data.nber.org/pos/web_update/orig/\n",
    "with pd.option_context(\"display.max_columns\", 999):\n",
    "    display(pos_other_df.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8409589f",
   "metadata": {},
   "outputs": [],
   "source": [
    "density_columns = [c for c in nanda_df.columns if c.startswith(\"aden_sales_\") or c.startswith(\"popden_sales_\")] # keep features that are normalized per capita (popden_sales_*) or area (aden_sales_*)\n",
    "nanda_filtered = nanda_df.loc[nanda_df[\"year\"] == 2017, [\"tract_fips10\"] + density_columns ].dropna() # most recent year\n",
    "nanda_filtered[\"STATE_CODE\"] = nanda_df[\"tract_fips10\"].str.slice(0, 2)\n",
    "nanda_filtered.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "372fc2d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_nanda = nanda_filtered.groupby(\"STATE_CODE\").mean().reset_index().drop(columns=\"tract_fips10\")\n",
    "final_nanda = final_nanda.drop(columns=\"STATE_CODE\")\n",
    "final_nanda.to_csv(\"../analytic/nanda_features.csv\")\n",
    "final_nanda.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50467e2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    "    \"PRVDR_NUM\",\n",
    "    \"STATE_CD\",\n",
    "    \"FIPS_STATE_CD\",\n",
    "    \"ELGBLTY_SW\",\n",
    "    \"PGM_TRMNTN_CD\",\n",
    "    \"CMPLNC_STUS_CD\",\n",
    "    \"GNRL_CNTL_TYPE_CD\",\n",
    "    \"CBSA_URBN_RRL_IND\",\n",
    "    \"GNRL_FAC_TYPE_CD\",\n",
    "]\n",
    "lab_cols = [\n",
    "    \"CRTFCT_TYPE_CD\",\n",
    "    \"FORM_116_ACRDTD_TEST_VOL_CNT\"\n",
    "]\n",
    "\n",
    "provider_cols = [\n",
    "    \"PRVDR_CTGRY_CD\",\n",
    "    \"CRTFD_BED_CNT\",\n",
    "    \"MDCL_SCHL_AFLTN_CD\",\n",
    "    \"PRSNEL_OTHR_CNT\",\n",
    "]\n",
    "# filter on program being active + elibigle for Medicare\n",
    "pos_other_slice = pos_other_df.loc[(pos_other_df[\"ELGBLTY_SW\"] == \"Y\") & (pos_other_df[\"PGM_TRMNTN_CD\"] == 0), cols + provider_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "40905e85",
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_other_slice.columns # double check "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "877b9dda",
   "metadata": {},
   "outputs": [],
   "source": [
    "hosp_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 1) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"1\", \"2\"]))\n",
    "hosp_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 1) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"4\", \"9\"]))\n",
    "hosp_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 1) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"5\", \"6\", \"7\", \"8\", \"10\"]))\n",
    "\n",
    "snf_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"].isin([2, 3, 4, 10])) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"4\", \"5\", \"6\"]))\n",
    "snf_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"].isin([2, 3, 4, 10])) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"1\", \"2\", \"3\", \"13\"]))\n",
    "snf_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"].isin([2, 3, 4, 10])) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"7\", \"8\", \"9\", \"10\", \"11\", \"12\"]))\n",
    "\n",
    "is_snf = pos_other_slice[\"PRVDR_CTGRY_CD\"].isin([2, 3, 4, 10])\n",
    "\n",
    "hha_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 5) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"1\", \"2\", \"3\"]))\n",
    "hha_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 5) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"4\"]))\n",
    "hha_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 5) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"5\", \"6\", \"7\"]))\n",
    "\n",
    "op_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 8) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"1\", \"2\"]))\n",
    "op_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 8) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"6\")\n",
    "op_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 8) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"3\", \"4\"]))\n",
    "\n",
    "esrd_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 9) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"2\")\n",
    "esrd_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 9) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"1\")\n",
    "esrd_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 9) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"3\")\n",
    "\n",
    "intel_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 11) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"1\")\n",
    "intel_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 11) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"2\")\n",
    "intel_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 11) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"3\", \"4\", \"5\", \"6\"]))\n",
    "\n",
    "rhc_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 12) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].str.startswith(\"2\"))\n",
    "rhc_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 12) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].str.startswith(\"1\"))\n",
    "rhc_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 12) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"3\", \"4\", \"5\"]))\n",
    "\n",
    "asc_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 15) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"2\")\n",
    "asc_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 15) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"1\")\n",
    "asc_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 15) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"] == \"3\")\n",
    "\n",
    "hspc_nonprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 16) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"1\", \"2\", \"3\"]))\n",
    "hspc_forprofit = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 16) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"4\", \"5\", \"6\", \"7\"]))\n",
    "hspc_public = (pos_other_slice[\"PRVDR_CTGRY_CD\"] == 16) & (pos_other_slice[\"GNRL_CNTL_TYPE_CD\"].isin([\"8\", \"9\", \"10\", \"11\"]))\n",
    "\n",
    "\n",
    "# total for profit, total nonprofit\n",
    "# total by pr + nonpr by type \n",
    "# total inst. by state (% hosp, % snf, etc)\n",
    "\n",
    "rate_df = pd.concat([\n",
    "    hosp_nonprofit, hosp_forprofit, hosp_public,\n",
    "    snf_nonprofit, snf_forprofit, snf_public,\n",
    "    hha_nonprofit, hha_forprofit, hha_public,\n",
    "    op_nonprofit, op_forprofit, op_public,\n",
    "    esrd_nonprofit, esrd_forprofit, esrd_public,\n",
    "    intel_nonprofit, intel_forprofit, intel_public,\n",
    "    rhc_nonprofit, rhc_forprofit, rhc_public,\n",
    "    asc_nonprofit, asc_forprofit, asc_public,\n",
    "    hspc_nonprofit, hspc_forprofit, hspc_public,\n",
    "    is_snf,\n",
    "], keys=[\n",
    "    \"hospital_np\", \"hospital_pr\", \"hospital_pub\",\n",
    "    \"snf_np\", \"snf_pr\", \"snf_pub\",\n",
    "    \"hha_np\", \"hha_pr\", \"hha_pub\",\n",
    "    \"op_np\", \"op_pr\", \"op_pub\",\n",
    "    \"esrd_np\", \"esrd_pr\", \"esrd_pub\",\n",
    "    \"intel_np\", \"intel_pr\", \"intel_pub\",\n",
    "    \"rhc_np\", \"rhc_pr\", \"rhc_pub\",\n",
    "    \"asc_np\", \"asc_pr\", \"asc_pub\",\n",
    "    \"hspc_np\", \"hspc_pr\", \"hspc_pub\",\n",
    "    \"is_snf\"\n",
    "], axis=1)\n",
    "rate_df\n",
    "\n",
    "total_np = rate_df.loc[:, [c for c in rate_df.columns if c.endswith(\"_np\")]].sum(axis=1)\n",
    "total_p = rate_df.loc[:, [c for c in rate_df.columns if c.endswith(\"_pr\")]].sum(axis=1)\n",
    "total_pub = rate_df.loc[:, [c for c in rate_df.columns if c.endswith(\"_pub\")]].sum(axis=1)\n",
    "\n",
    "\n",
    "total_df = pd.concat([total_np, total_p, total_pub], axis=1, keys=[\"total_np\", \"total_pr\", \"total_pub\"])\n",
    "final_rate_df = pd.concat([rate_df, total_df], axis=1).astype(int)\n",
    "final_rate_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a32094f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "dropped_df = pos_other_slice.select_dtypes(object).drop(columns=[\"PRVDR_NUM\", \"STATE_CD\"])\n",
    "\n",
    "for column in dropped_df.columns:\n",
    "    display(dropped_df[column].value_counts(dropna=False))\n",
    "    \n",
    "compliance_dummies = pd.get_dummies(pos_other_slice[\"CMPLNC_STUS_CD\"], drop_first=True, dtype=int, prefix=\"CMPLNC\")\n",
    "#ownership_dummies = pd.get_dummies(pos_other_slice[\"GNRL_CNTL_TYPE_CD\"], drop_first=False, dtype=int, prefix=\"GNRL_CNTL\")\n",
    "urban_rural_dummies = pd.get_dummies(pos_other_slice[\"CBSA_URBN_RRL_IND\"], drop_first=True, dtype=int, prefix=\"URBN_RRL\")\n",
    "pos_other_slice[\"provider_textname\"] = pos_other_slice[\"PRVDR_CTGRY_CD\"].map({\n",
    "    1: \"hospital\",\n",
    "    2: \"snf\",\n",
    "    3: \"snf\",\n",
    "    4: \"snf\",\n",
    "    5: \"hha\",\n",
    "    6: \"op-pt/st\",\n",
    "    9: \"esrd\",\n",
    "    10: \"snf\",\n",
    "    11: \"icf/id\",\n",
    "    12: \"rhc\",\n",
    "    15: \"asc\",\n",
    "    16: \"hspc\",\n",
    "})\n",
    "provider_dummies = pd.get_dummies(pos_other_slice[\"provider_textname\"], drop_first=False, dtype=int, prefix=\"provider_type\")\n",
    "\n",
    "final_pos_df = pd.concat([pos_other_slice.select_dtypes(exclude=['object']), compliance_dummies, final_rate_df, urban_rural_dummies, provider_dummies], axis=1)\n",
    "final_pos_df = final_pos_df.groupby(\"FIPS_STATE_CD\").mean().reset_index()\n",
    "final_pos_df[\"p_np_ratio\"] = final_pos_df[\"total_pr\"] / final_pos_df[\"total_np\"]\n",
    "final_pos_df = final_pos_df[final_pos_df[\"FIPS_STATE_CD\"] <= 56].drop(columns=[\"FIPS_STATE_CD\", \"PGM_TRMNTN_CD\"]) # the wyoming hack!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "add487d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "final_pos_df.to_csv(\"../analytic/pos.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "87c33406",
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.concat([final_pos_df, final_nanda], axis=1).to_csv(\"../analytic/state_features.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
