{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c2e3da53-beab-4a35-9201-78cdc327d95e",
   "metadata": {},
   "source": [
    "# Description \n",
    "This notebook combines data from CMS Hospital Compare. The data is downlaodable as separate csv files. This data dictionary page lists and describes all available files through the CMS Hospital Compare data repository: https://data.cms.gov/provider-data/dataset/dgmq-aat3#data-dictionary\n",
    "\n",
    "The subset of measures used by Cal Hospital Compare can be found here: https://calhospitalcompare.org/wp-content/uploads/2025/04/Methodology-Primer_Patient-Safety-Honor-Roll_Cal-Hospital-Compare_2024-1.pdf\n",
    "\n",
    "Before running this notebook: Download each of the following csv files from the CMS Hospital Compare provider data repository: https://data.cms.gov/provider-data/\n",
    "* Healthcare_Associated_Infections-Hospital.csv\n",
    "* Complications_and_Deaths-Hospital.csv\n",
    "* Timely_and_Effective_Care-Hospital.csv\n",
    "* HCAHPS-Hospital.csv\n",
    "* Maternal_Health-Hospital.csv\n",
    "* Unplanned_Hospital_Visits-Hospital.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6d9381c4-7394-48d8-a841-87a2d480fd11",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "import math\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import random\n",
    "import selection_algorithms"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bcbe1d17-702b-4e8e-ba81-8ff03ebaddde",
   "metadata": {},
   "source": [
    "# Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "0aed0339-ccd2-4921-8e8a-b2098d3d637a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def convert_cms_to_scores_df(input_df, measure_ids, measure_id_column='Measure ID', score_column='Score'):\n",
    "    \"\"\"Converts a CMS formatted dataframe to a dataframe with score columns.\n",
    "    \n",
    "    Args:\n",
    "      input_df: DataFrame with Facility ID column, Measure ID column, and Score column.\n",
    "      measure_ids: List of Measure ID strings to keep as columns in the output scores_df.\n",
    "      \n",
    "    Returns:\n",
    "      scores_df: dataframe with Facility ID and measure_ids as columns, where values are the Score.\n",
    "    \"\"\"\n",
    "    filtered_columns = ['Facility ID','Facility Name', 'State', measure_id_column,score_column]\n",
    "    input_df_filtered = input_df.loc[input_df[measure_id_column].isin(measure_ids)][filtered_columns]\n",
    "    output_df = input_df_filtered.pivot(index=['Facility ID', 'Facility Name', 'State'], columns=measure_id_column, values=score_column)\n",
    "    output_df = output_df.replace('Not Available', np.nan)\n",
    "    return output_df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "42c628f6-d61c-4980-bded-07c9dfe41d50",
   "metadata": {},
   "source": [
    "# Load datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af01e9a2-06d3-45db-a1e6-a17a45aad70c",
   "metadata": {},
   "source": [
    "## Healthcare_Associated_Infections-Hospital.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "38531f94-1d09-450f-a661-a496033bfe03",
   "metadata": {},
   "outputs": [],
   "source": [
    "hai_df = pd.read_csv('Healthcare_Associated_Infections-Hospital.csv')\n",
    "HAI_COLUMNS = [('HAI_%d_SIR' % (i)) for i in range(1,7)]\n",
    "hai_df = convert_cms_to_scores_df(hai_df, HAI_COLUMNS)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d96829a3-cde5-4bcf-af09-9b057ce2bd3e",
   "metadata": {},
   "source": [
    "## Complications_and_Deaths-Hospital.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "b25a3b60-f39f-4634-854f-6b32876c35c1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "psi_df_raw = pd.read_csv('Complications_and_Deaths-Hospital.csv')\n",
    "PSI_COLUMNS = [('PSI_%02d' % (i)) for i in range(3,16)] + ['PSI_90', 'COMP_HIP_KNEE', 'MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 'MORT_30_PN', 'MORT_30_STK'] \n",
    "psi_df = convert_cms_to_scores_df(psi_df_raw, PSI_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ce43b7f-6d4f-45d0-b6e6-58adeb64ccca",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df = hai_df.join(psi_df, on=['Facility ID', 'Facility Name', 'State'], how='inner')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0e90d09-c76e-4aec-9ecf-f77023db802e",
   "metadata": {},
   "source": [
    "## Timely_and_Effective_Care-Hospital.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "c9b92156-f838-4e1a-a158-188e2158db0a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "sepsis_df_raw = pd.read_csv('Timely_and_Effective_Care-Hospital.csv')\n",
    "SEPSIS_COLUMNS = ['SEP_1', 'OP_23', 'OP_29'] #'HCP_COVID_19'\n",
    "sepsis_df = convert_cms_to_scores_df(sepsis_df_raw, SEPSIS_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "978f7dc8-cdd9-42d3-809f-557c666100ef",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df_2 = joined_df.join(sepsis_df, on=['Facility ID', 'Facility Name', 'State'], how='inner')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "98a3dcad-159f-4ca2-ba68-b9796d7a0e0e",
   "metadata": {},
   "source": [
    "## HCAHPS-Hospital.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf912449-22d3-4979-a1fd-1d26d106c8fa",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "survey_df_raw = pd.read_csv('HCAHPS-Hospital.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "a237bf64-8a9b-4d71-9feb-9ae735ff7ea3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "SURVEY_COLUMNS = ['H_COMP_1_A_P', 'H_COMP_2_A_P', 'H_COMP_3_A_P', 'H_COMP_5_A_P', 'H_COMP_7_SA', 'H_COMP_6_Y_P']\n",
    "survey_df = convert_cms_to_scores_df(survey_df_raw, SURVEY_COLUMNS, measure_id_column='HCAHPS Measure ID', score_column='HCAHPS Answer Percent')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f7b49444-a353-43ba-840b-734f1f087f15",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df_3 = joined_df_2.join(survey_df, on=['Facility ID', 'Facility Name', 'State'], how='inner')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fa12336-6fd4-47d8-8d4a-f61ec9bc6f9a",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Maternal_Health-Hospital.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "8f2b228e-9012-4667-a91a-8414f9186f7f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "mat_df_raw = pd.read_csv('Maternal_Health-Hospital.csv', dtype={'Facility ID': str}) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "5be58701-5992-40d9-8836-2a5e53883b54",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "MAT_COLUMNS = ['ePC_07a', 'ePC_07b']\n",
    "mat_df = convert_cms_to_scores_df(mat_df_raw, MAT_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ff0115e-8390-4f31-ae67-88f6ecae6df7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df_4 = joined_df_3.join(mat_df, on=['Facility ID', 'Facility Name', 'State'], how='inner')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bc1d2d10-0b8b-4ebd-8407-67510d4889bb",
   "metadata": {},
   "source": [
    "## Unplanned_Hospital_Visits-Hospital.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2565995-f532-43d5-9439-435cb614c649",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "unp_df_raw = pd.read_csv('Unplanned_Hospital_Visits-Hospital.csv', dtype={'Facility ID': str}) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "id": "841b96db-c60b-422a-8faf-c3c300b657b3",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "UNP_COLUMNS = ['OP_32', 'OP_36', 'OP_35_ADM', 'OP_35_ED', \n",
    "               'READM_30_AMI', 'READM_30_COPD', 'READM_30_CABG', \n",
    "               'READM_30_HF', 'READM_30_HIP_KNEE', 'READM_30_PN', \n",
    "               'READM_30_HOSP_WIDE', 'EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN']\n",
    "unp_df = convert_cms_to_scores_df(unp_df_raw, UNP_COLUMNS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2060aa04-9541-441d-ad0a-818989262649",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df_5 = joined_df_4.join(unp_df, on=['Facility ID', 'Facility Name', 'State'], how='inner')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "85bb8231-a252-4360-a598-3cf58e2f57c7",
   "metadata": {},
   "source": [
    "# Filter to only include hospitals eligible for Cal Hospital Compare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "id": "ddbe555b-84bc-48cd-9f90-97a02a5b02b0",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df_5 = joined_df_5.apply(pd.to_numeric)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "id": "927108f7-decb-472d-a5db-c69256067522",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df_5 = joined_df_5.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "id": "4c9566fb-d59c-4505-a853-cfe115a5876d",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "joined_df_5 = joined_df_5[joined_df_5['State'] == 'CA']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38aea530-08c1-44ca-8765-2f4cb98361fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cal hospital compare measures \n",
    "CHC_METRICS = ['HAI_1_SIR','HAI_2_SIR','HAI_3_SIR','HAI_5_SIR','HAI_6_SIR', 'PSI_90', 'SEP_1', 'H_COMP_1_A_P', 'H_COMP_2_A_P', 'H_COMP_3_A_P', 'H_COMP_5_A_P', 'H_COMP_7_SA']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "id": "edb54a81-a03f-474d-bfe1-9f42d46da18f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "output_df = joined_df_5.dropna(subset=CHC_METRICS, thresh=6).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "f80ee79c-088e-4b17-b82a-f5455353966a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Convert all columns to \"higher is better\"\n",
    "lower_is_better_columns = [('HAI_%d_SIR' % (i)) for i in range(1,7)] + ['PSI_03', 'PSI_04', 'PSI_06', 'PSI_08','PSI_09', 'PSI_10', 'PSI_11', 'PSI_12', 'PSI_13', 'PSI_14', 'PSI_15','PSI_90'] + ['COMP_HIP_KNEE', 'MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 'MORT_30_PN', 'MORT_30_STK'] + ['ePC_07a', 'ePC_07b']+ ['OP_32', 'OP_36', 'OP_35_ADM', 'OP_35_ED', 'READM_30_AMI', 'READM_30_COPD', 'READM_30_CABG', 'READM_30_HF', 'READM_30_HIP_KNEE', 'READM_30_PN', 'READM_30_HOSP_WIDE', 'EDAC_30_AMI', 'EDAC_30_HF', 'EDAC_30_PN']\n",
    "\n",
    "lower_is_better_columns\n",
    "for column in lower_is_better_columns:\n",
    "    output_df[column] = -output_df[column]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "3c89ed1b-d8fa-44af-8ec6-f30583c22229",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "output_df.to_csv('all_cms_metrics_ca.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
