{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!git clone https://github.com/anonymousindividual007/Multi-environment-Topic-Models"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1HcwgerIiF40",
        "outputId": "bb223c2f-2a32-43d5-e5b6-3669f5a87f8a"
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'Multi-environment-Topic-Models'...\n",
            "remote: Enumerating objects: 82, done.\u001b[K\n",
            "remote: Counting objects: 100% (82/82), done.\u001b[K\n",
            "remote: Compressing objects: 100% (82/82), done.\u001b[K\n",
            "remote: Total 82 (delta 40), reused 0 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
            "Receiving objects: 100% (82/82), 42.64 MiB | 9.59 MiB/s, done.\n",
            "Resolving deltas: 100% (40/40), done.\n",
            "Updating files: 100% (17/17), done.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "id": "PGAla_frE5jG"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.metrics import classification_report\n",
        "import statsmodels.api as sm\n",
        "import zipfile\n",
        "import os"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "The `topic_proportions` folder contains the topic proportions from the other baseline models. To evaluate one of them, change the zip file path below so that it points at that model's file. A couple of cells further down, you should also change the topic column you index into."
      ],
      "metadata": {
        "id": "V0ad8Ok648j4"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "syn_experiment='senior'"
      ],
      "metadata": {
        "id": "vF7iIpPfXj2H"
      },
      "execution_count": 3,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "temp_dir = '/content/temp_dir'\n",
        "repo_dir = '/content/Multi-environment-Topic-Models'\n",
        "\n",
        "# (zip archive, CSV member) pairs that feed the `vtm` and `mtm` frames for each experiment.\n",
        "topic_files = {\n",
        "    'energy': {\n",
        "        'vtm': ('VTM_train_topic_proportions_lda.csv.zip', 'VTM_train_topic_proportions_lda.csv'),\n",
        "        'mtm': ('eatm2_topic_proportions.csv.zip', 'eatm2_topic_proportions.csv'),\n",
        "    },\n",
        "    'senior': {\n",
        "        'vtm': ('VTM_tp_immi.csv.zip', 'VTM_tp_immi.csv'),\n",
        "        'mtm': ('2mtm_medicare_ss_topics.csv.zip', '2mtm_medicare_ss_topics.csv'),\n",
        "    },\n",
        "}\n",
        "\n",
        "\n",
        "def load_zipped_csv(zip_name, csv_name, extract_dir=temp_dir):\n",
        "    \"\"\"Extract one CSV from an archive in the cloned repo and read it with pandas.\n",
        "\n",
        "    Args:\n",
        "        zip_name: File name of the zip archive inside `repo_dir`.\n",
        "        csv_name: Name of the CSV member to extract from that archive.\n",
        "        extract_dir: Directory the member is extracted into.\n",
        "\n",
        "    Returns:\n",
        "        The extracted CSV loaded as a DataFrame.\n",
        "    \"\"\"\n",
        "    with zipfile.ZipFile(os.path.join(repo_dir, zip_name), 'r') as zip_ref:\n",
        "        # ZipFile.extract returns the path of the file it wrote.\n",
        "        csv_path = zip_ref.extract(csv_name, extract_dir)\n",
        "    return pd.read_csv(csv_path)\n",
        "\n",
        "\n",
        "# exist_ok=True replaces the separate os.path.exists check.\n",
        "os.makedirs(temp_dir, exist_ok=True)\n",
        "\n",
        "# As before, any value other than 'energy' selects the senior-experiment files.\n",
        "experiment_files = topic_files['energy' if syn_experiment == 'energy' else 'senior']\n",
        "vtm = load_zipped_csv(*experiment_files['vtm'])\n",
        "mtm = load_zipped_csv(*experiment_files['mtm'])\n",
        "\n",
        "data = pd.read_csv(os.path.join(repo_dir, 'ci_mtm_senior_energy.csv'))"
      ],
      "metadata": {
        "id": "8fhojo9wNHyr"
      },
      "execution_count": 4,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Remove the 'Unnamed: 0' column (presumably the index saved with the CSV).\n",
        "# errors='ignore' keeps this cell re-runnable: without it a second run raises KeyError\n",
        "# because the column is already gone.\n",
        "mtm = mtm.drop(columns=['Unnamed: 0'], errors='ignore')"
      ],
      "metadata": {
        "id": "0_I4a2HQi7wT"
      },
      "execution_count": 7,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Uncomment for MTM proportions from the energy experiment"
      ],
      "metadata": {
        "id": "TonXC38RYPLC"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#energy\n",
        "# def process_row_mtm(row):\n",
        "#     if row['Topic_28'] == max(row):\n",
        "#         return 1\n",
        "#     else:\n",
        "#         return 0\n",
        "\n",
        "# mtm['binary 28'] = mtm.apply(process_row_mtm, axis=1)"
      ],
      "metadata": {
        "id": "whL8bf-1YOlR"
      },
      "execution_count": 8,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Senior experiment: MTM"
      ],
      "metadata": {
        "id": "Y_hbGSu0YVUT"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def process_row_mtm(row):\n",
        "    \"\"\"Return 1 if the row's Topic_15 value equals the row maximum, else 0.\"\"\"\n",
        "    return int(row['Topic_15'] == max(row))\n",
        "\n",
        "\n",
        "# Leave a 'binary topic' column from an earlier run of this cell out of the comparison;\n",
        "# otherwise its 0/1 values take part in max(row) and re-running the cell changes the labels.\n",
        "mtm_topic_props = mtm.drop(columns=['binary topic'], errors='ignore')\n",
        "mtm['binary topic'] = mtm_topic_props.apply(process_row_mtm, axis=1)"
      ],
      "metadata": {
        "id": "ouP6geciUwdS"
      },
      "execution_count": 20,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Energy topic for baseline models. The column names used to index `row` identify the topic that corresponds to energy or senior social policies in each experiment. Uncomment the block for the model you would like to evaluate."
      ],
      "metadata": {
        "id": "F85Ki9dmZmDe"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#vtm\n",
        "# def process_row_vtm_15(row):\n",
        "#     if row['15'] == max(row):\n",
        "#         return 1\n",
        "#     else:\n",
        "#         return 0\n",
        "# vtm_topics['binary 15'] = vtm_topics.apply(process_row_vtm_15, axis=1)\n",
        "\n",
        "\n",
        "#lda\n",
        "# def process_row_vtm_15(row):\n",
        "#     if row.idxmax() == 'Topic_11':\n",
        "#         return 1\n",
        "#     else:\n",
        "#         return 0\n",
        "\n",
        "#prodlda\n",
        "# def process_row_vtm_15(row):\n",
        "#     if row.idxmax() == 'Topic_1':\n",
        "#         return 1\n",
        "#     else:\n",
        "#         return 0\n",
        "\n",
        "#bertopic\n",
        "# def process_row_vtm_15(row):\n",
        "#     if row.idxmax() == 'Topic_19':\n",
        "#         return 1\n",
        "#     else:\n",
        "#         return 0\n",
        "\n"
      ],
      "metadata": {
        "id": "TWDHuW7pZkrX"
      },
      "execution_count": 21,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Senior topic for baseline models"
      ],
      "metadata": {
        "id": "IXXqL7gcZpDM"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#senior\n",
        "# Column holding the senior-policy topic in each baseline model's proportions file.\n",
        "senior_topic_by_model = {\n",
        "    'prodlda': 'Topic_21',\n",
        "    'gensim': 'Topic_8',\n",
        "    'vtm': 'Topic_0',\n",
        "    'bertopic': 'Topic_2',\n",
        "}\n",
        "baseline_model = 'vtm'  # change this key to evaluate a different baseline\n",
        "target_topic = senior_topic_by_model[baseline_model]\n",
        "\n",
        "\n",
        "def process_row_vtm_15(row, target_topic=target_topic):\n",
        "    \"\"\"Return 1 if `target_topic` is the column with the largest value in `row`, else 0.\"\"\"\n",
        "    return int(row.idxmax() == target_topic)\n",
        "\n",
        "\n",
        "# Compare topic columns only. 'Unnamed: 0' is dropped from mtm in an earlier cell but not\n",
        "# from vtm, and a 'binary 15' left over from a previous run would also compete in idxmax.\n",
        "vtm_topic_props = vtm.drop(columns=['Unnamed: 0', 'binary 15'], errors='ignore')\n",
        "\n",
        "# A missing target column would silently yield an all-zero indicator and a singular\n",
        "# regression design further down, so fail here with an explicit message instead.\n",
        "if target_topic not in vtm_topic_props.columns:\n",
        "    raise KeyError(f'{target_topic!r} is not a column of vtm; first columns: {list(vtm_topic_props.columns[:5])}')\n",
        "\n",
        "vtm['binary 15'] = vtm_topic_props.apply(process_row_vtm_15, axis=1)"
      ],
      "metadata": {
        "id": "0dlmyT8Jktw7"
      },
      "execution_count": 22,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Combined topics for the VTM in the energy experiment"
      ],
      "metadata": {
        "id": "mMbI6QnBYozi"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# def process_row_vtm(row):\n",
        "#     max_value = max(row)\n",
        "#     # Check if the maximum value corresponds to either '15' or '21'\n",
        "#     if row['15'] == max_value or row['21'] == max_value:\n",
        "#         return 1\n",
        "#     else:\n",
        "#         return 0\n",
        "\n",
        "# vtm_topics['binary topic'] = vtm_topics.apply(process_row_vtm, axis=1)"
      ],
      "metadata": {
        "id": "343l5metukao"
      },
      "execution_count": 23,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Binary regressor: 1 where `source` is exactly 'Republican', 0 for every other value\n",
        "is_republican = data['source'] == 'Republican'\n",
        "data['source_indicator'] = is_republican.astype(int)"
      ],
      "metadata": {
        "id": "8Qu4Xm8nZWl-"
      },
      "execution_count": 24,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Inner-join each topic frame onto the documents by row index. This relies on the\n",
        "# proportion files listing documents in the same order as `data` (TODO confirm).\n",
        "index_join = dict(left_index=True, right_index=True)\n",
        "merged_df_vtm = pd.merge(data, vtm, **index_join)\n",
        "merged_df_mtm = pd.merge(data, mtm, **index_join)\n"
      ],
      "metadata": {
        "id": "GVUdDz1KZxte"
      },
      "execution_count": 25,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Change is_senior to is_energy for the energy experiment"
      ],
      "metadata": {
        "id": "7FZMvj1RaCEm"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "#sampling senior: draw 700 rows from each is_senior class with a fixed seed,\n",
        "# then stack them with the True rows first\n",
        "mtm_is_senior = merged_df_mtm['is_senior']\n",
        "senior_true_sample_2 = merged_df_mtm.loc[mtm_is_senior == True].sample(n=700, random_state=1)\n",
        "senior_false_sample_2 = merged_df_mtm.loc[mtm_is_senior == False].sample(n=700, random_state=1)\n",
        "final_mtm = pd.concat([senior_true_sample_2, senior_false_sample_2])"
      ],
      "metadata": {
        "id": "iWmUZMU-aLIK"
      },
      "execution_count": 26,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Same balanced draw for the baseline frame: 700 rows per is_senior class, fixed seed\n",
        "vtm_is_senior = merged_df_vtm['is_senior']\n",
        "senior_true_sample_1 = merged_df_vtm.loc[vtm_is_senior == True].sample(n=700, random_state=1)\n",
        "senior_false_sample_1 = merged_df_vtm.loc[vtm_is_senior == False].sample(n=700, random_state=1)\n",
        "final_vtm = pd.concat([senior_true_sample_1, senior_false_sample_1])"
      ],
      "metadata": {
        "id": "aI4Z12ZoaEwT"
      },
      "execution_count": 27,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "baseline_prob = 0.5\n"
      ],
      "metadata": {
        "id": "7FZOxcFJMRNA"
      },
      "execution_count": 28,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def adjust_probability(row, baseline_prob=baseline_prob, seed=None):\n",
        "    \"\"\"Draw a synthetic 0/1 outcome for one row.\n",
        "\n",
        "    Rows with a falsy `is_senior` get a single Bernoulli(baseline_prob) draw. Rows with a\n",
        "    truthy `is_senior` get a Bernoulli draw whose probability is\n",
        "    baseline_prob + 0.2 + Uniform(0, 0.1), capped at 1.\n",
        "\n",
        "    `seed` is accepted but never used; every draw comes from NumPy's global random state.\n",
        "    \"\"\"\n",
        "    # For the energy experiment, test row['is_energy'] here instead.\n",
        "    if not row['is_senior']:\n",
        "        return np.random.binomial(1, baseline_prob)\n",
        "\n",
        "    noise = np.random.uniform(0, 0.1)\n",
        "    adjusted_prob = min(baseline_prob + 0.2 + noise, 1)\n",
        "    return np.random.binomial(1, adjusted_prob)\n",
        "\n",
        "\n",
        "# Seed the global generator so the synthetic outcomes are reproducible\n",
        "np.random.seed(10)\n",
        "\n",
        "# These placeholder outcomes are overwritten by the per-row draws below. They are kept\n",
        "# because they advance the global random state, so removing them would change the results.\n",
        "for frame in (final_vtm, final_mtm):\n",
        "    frame['outcome'] = np.random.binomial(1, baseline_prob, size=len(frame))\n",
        "\n",
        "for frame in (final_vtm, final_mtm):\n",
        "    frame['outcome'] = frame.apply(adjust_probability, axis=1)"
      ],
      "metadata": {
        "id": "JuPWU3lqMLxh"
      },
      "execution_count": 29,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# OLS of the 0/1 outcome on a constant, the MTM topic indicator and the source\n",
        "# indicator, with HC3 heteroscedasticity-robust standard errors.\n",
        "mtm_regressors = ['binary topic', 'source_indicator']\n",
        "y_2 = final_mtm['outcome']\n",
        "X_2 = sm.add_constant(final_mtm[mtm_regressors])\n",
        "model_eatm2 = sm.OLS(endog=y_2, exog=X_2).fit(cov_type='HC3')\n",
        "print(model_eatm2.summary())"
      ],
      "metadata": {
        "id": "BT-pxeB3Y5CT",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "dc8bde87-431c-4dda-b8e2-d51b16d5d904"
      },
      "execution_count": 30,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "                            OLS Regression Results                            \n",
            "==============================================================================\n",
            "Dep. Variable:                outcome   R-squared:                       0.042\n",
            "Model:                            OLS   Adj. R-squared:                  0.041\n",
            "Method:                 Least Squares   F-statistic:                     34.55\n",
            "Date:                Tue, 01 Oct 2024   Prob (F-statistic):           2.25e-15\n",
            "Time:                        14:15:02   Log-Likelihood:                -948.92\n",
            "No. Observations:                1400   AIC:                             1904.\n",
            "Df Residuals:                    1397   BIC:                             1920.\n",
            "Df Model:                           2                                         \n",
            "Covariance Type:                  HC3                                         \n",
            "====================================================================================\n",
            "                       coef    std err          z      P>|z|      [0.025      0.975]\n",
            "------------------------------------------------------------------------------------\n",
            "const                0.5718      0.021     27.891      0.000       0.532       0.612\n",
            "binary topic         0.2025      0.027      7.595      0.000       0.150       0.255\n",
            "source_indicator    -0.0467      0.026     -1.766      0.077      -0.098       0.005\n",
            "==============================================================================\n",
            "Omnibus:                     7005.542   Durbin-Watson:                   1.952\n",
            "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              200.454\n",
            "Skew:                          -0.420   Prob(JB):                     2.96e-44\n",
            "Kurtosis:                       1.348   Cond. No.                         3.04\n",
            "==============================================================================\n",
            "\n",
            "Notes:\n",
            "[1] Standard Errors are heteroscedasticity robust (HC3)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Same specification for the baseline frame: outcome on a constant, the baseline topic\n",
        "# indicator and the source indicator, with HC3 robust standard errors.\n",
        "vtm_regressors = ['binary 15', 'source_indicator']\n",
        "vtm_y_2 = final_vtm['outcome']\n",
        "vtm_X_2 = final_vtm[vtm_regressors]\n",
        "vX_2 = sm.add_constant(vtm_X_2)\n",
        "model_vtm_2 = sm.OLS(endog=vtm_y_2, exog=vX_2).fit(cov_type='HC3')\n",
        "print(model_vtm_2.summary())"
      ],
      "metadata": {
        "id": "jp66_obgaJrl",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "422ad7be-8bdc-4368-d288-19426344a765"
      },
      "execution_count": 31,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "                            OLS Regression Results                            \n",
            "==============================================================================\n",
            "Dep. Variable:                outcome   R-squared:                       0.004\n",
            "Model:                            OLS   Adj. R-squared:                  0.003\n",
            "Method:                 Least Squares   F-statistic:                     4.971\n",
            "Date:                Tue, 01 Oct 2024   Prob (F-statistic):             0.0259\n",
            "Time:                        14:15:02   Log-Likelihood:                -964.60\n",
            "No. Observations:                1400   AIC:                             1933.\n",
            "Df Residuals:                    1398   BIC:                             1944.\n",
            "Df Model:                           1                                         \n",
            "Covariance Type:                  HC3                                         \n",
            "====================================================================================\n",
            "                       coef    std err          z      P>|z|      [0.025      0.975]\n",
            "------------------------------------------------------------------------------------\n",
            "const                0.6553      0.017     38.754      0.000       0.622       0.688\n",
            "binary 15                 0          0        nan        nan           0           0\n",
            "source_indicator    -0.0583      0.026     -2.230      0.026      -0.109      -0.007\n",
            "==============================================================================\n",
            "Omnibus:                     6628.596   Durbin-Watson:                   1.761\n",
            "Prob(Omnibus):                  0.000   Jarque-Bera (JB):              235.064\n",
            "Skew:                          -0.536   Prob(JB):                     9.05e-52\n",
            "Kurtosis:                       1.302   Cond. No.                          inf\n",
            "==============================================================================\n",
            "\n",
            "Notes:\n",
            "[1] Standard Errors are heteroscedasticity robust (HC3)\n",
            "[2] The smallest eigenvalue is      0. This might indicate that there are\n",
            "strong multicollinearity problems or that the design matrix is singular.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/statsmodels/regression/linear_model.py:1966: RuntimeWarning: divide by zero encountered in scalar divide\n",
            "  return np.sqrt(eigvals[0]/eigvals[-1])\n",
            "/usr/local/lib/python3.10/dist-packages/statsmodels/base/model.py:1894: ValueWarning: covariance of constraints does not have full rank. The number of constraints is 2, but rank is 1\n",
            "  warnings.warn('covariance of constraints does not have full '\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "-mZMxPUPuwXA"
      },
      "execution_count": 31,
      "outputs": []
    }
  ]
}