{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/florisholstege/Documents/GitHub/opt-separation/.env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Correlation between fact and stereotype scores on test set:  0.24638547733132893\n",
      "n in test set:  1111\n",
      "n of professions in test set:  101\n",
      "Correlation between fact and stereotype scores on train set:  0.6963347866910748\n",
      "n in train set:  2409\n",
      "n of professions in train set:  219\n",
      "Overlap between train and test sets:  0\n"
     ]
    }
   ],
   "source": [
    "from data import DamaData\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import random\n",
    "import pickle\n",
    "import json\n",
    "\n",
    "# Initialize the DAMA dataset handler\n",
    "dama_data = DamaData()\n",
    "\n",
    "# Create the dataset\n",
    "df = dama_data.create_dataset()\n",
    "\n",
    "# write it out to data/dama/dama_professions.csv\n",
    "#df.to_csv('data/dama/dama_professions.csv', index=False)\n",
    "df_test = df[df['split']=='test']\n",
    "df_train = df[df['split']=='train']\n",
    "\n",
    "\n",
    "corr= np.corrcoef(df_test['fact_score'], df_test['stereotype_score'])[0,1]\n",
    "print('Correlation between fact and stereotype scores on test set: ', corr)\n",
    "print('n in test set: ', len(df_test))\n",
    "print('n of professions in test set: ', len(df_test['profession'].unique()))\n",
    "\n",
    "corr= np.corrcoef(df_train['fact_score'], df_train['stereotype_score'])[0,1]\n",
    "print('Correlation between fact and stereotype scores on train set: ', corr)\n",
    "print('n in train set: ', len(df_train))\n",
    "print('n of professions in train set: ', len(df_train['profession'].unique()))\n",
    "\n",
    "# do these overlap?\n",
    "print('Overlap between train and test sets: ', len(set(df_test['profession'].unique()).intersection(set(df_train['profession'].unique()))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def compute_corr(y, z, indices):\n",
    "    \"\"\"Compute the Pearson correlation for the subset defined by indices.\"\"\"\n",
    "    subset_y = y[indices]\n",
    "    subset_z = z[indices]\n",
    "    # Handle the case where standard deviation might be zero\n",
    "    if np.std(subset_y) == 0 or np.std(subset_z) == 0:\n",
    "        return 0.0\n",
    "    return np.corrcoef(subset_y, subset_z)[0, 1]\n",
    "\n",
    "def find_subset_with_target_corr(y, z, m, target_corr, \n",
    "                                 n_iter=100000, initial_temp=1.0, cooling_rate=0.999, tol=1e-6, seed=0):\n",
    "    \"\"\"\n",
    "    Find a subset of indices of size m such that the correlation between y and z\n",
    "    on this subset is as close as possible to the target correlation.\n",
    "    \n",
    "    Parameters:\n",
    "        y, z         : 1D numpy arrays of observations.\n",
    "        m            : Size of the desired subsample.\n",
    "        target_corr  : Desired target correlation.\n",
    "        n_iter       : Number of iterations for simulated annealing.\n",
    "        initial_temp : Starting temperature for simulated annealing.\n",
    "        cooling_rate : Factor to reduce the temperature each iteration.\n",
    "        \n",
    "    Returns:\n",
    "        best_indices : The subset of indices achieving the best (closest) correlation.\n",
    "        best_corr    : The correlation corresponding to best_indices.\n",
    "        best_obj     : The absolute difference |corr - target_corr| for best_indices.\n",
    "    \"\"\"\n",
    "    n = len(y)\n",
    "    # Start with a random subset of m indices.\n",
    "    random.seed(seed)\n",
    "    current_indices = random.sample(range(n), m)\n",
    "    current_corr = compute_corr(y, z, current_indices)\n",
    "    current_obj = abs(current_corr - target_corr)\n",
    "    \n",
    "    # Keep track of the best found subset.\n",
    "    best_indices = current_indices.copy()\n",
    "    best_obj = current_obj\n",
    "    \n",
    "    T = initial_temp\n",
    "    for _ in range(n_iter):\n",
    "        # Propose a new subset by swapping one element.\n",
    "        current_set = set(current_indices)\n",
    "        not_in_subset = list(set(range(n)) - current_set)\n",
    "        \n",
    "        # Randomly choose one index to remove and one to add.\n",
    "        idx_remove = random.choice(current_indices)\n",
    "        idx_add = random.choice(not_in_subset)\n",
    "        \n",
    "        new_indices = current_indices.copy()\n",
    "        new_indices.remove(idx_remove)\n",
    "        new_indices.append(idx_add)\n",
    "        \n",
    "        new_corr = compute_corr(y, z, new_indices)\n",
    "        new_obj = abs(new_corr - target_corr)\n",
    "        \n",
    "        # Decide whether to accept the new subset.\n",
    "        delta = new_obj - current_obj\n",
    "        if delta < 0 or np.exp(-delta / T) > random.random():\n",
    "            current_indices = new_indices\n",
    "            current_obj = new_obj\n",
    "            current_corr = new_corr\n",
    "            # Update best if improved.\n",
    "            if new_obj < best_obj:\n",
    "                best_indices = new_indices.copy()\n",
    "                best_obj = new_obj\n",
    "        \n",
    "        # Cool down the temperature.\n",
    "        T *= cooling_rate\n",
    "\n",
    "        if abs(current_obj - target_corr) < tol:\n",
    "            break\n",
    "\n",
    "        \n",
    "    best_corr = compute_corr(y, z, best_indices)\n",
    "    return best_indices, best_corr, best_obj"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_and_save_uncorrelated_subsets(df_train, df_test, subset_size=1000, target_corr=0.0, \n",
    "                                      n_iter=100000, tol=1e-4, seed=1, save_dir='data/dama/'):\n",
    "    \"\"\"\n",
    "    Finds uncorrelated subsets of fact and stereotype scores for both train and test sets\n",
    "    and saves the resulting indices.\n",
    "    \n",
    "    Args:\n",
    "        df_train (pd.DataFrame): Training dataframe with fact_score and stereotype_score columns\n",
    "        df_test (pd.DataFrame): Test dataframe with fact_score and stereotype_score columns\n",
    "        subset_size (int): Size of the desired subsets\n",
    "        target_corr (float): Target correlation to achieve\n",
    "        n_iter (int): Number of iterations for finding subsets\n",
    "        tol (float): Tolerance for correlation matching\n",
    "        seed (int): Random seed for reproducibility\n",
    "        save_dir (str): Directory to save the indices files\n",
    "        \n",
    "    Returns:\n",
    "        dict: Dictionary containing results for both train and test sets\n",
    "    \"\"\"\n",
    "    results = {}\n",
    "    \n",
    "    # Process test set\n",
    "    y_te = df_test['fact_score'].values\n",
    "    z_te = df_test['stereotype_score'].values\n",
    "    \n",
    "    print('Test set statistics:')\n",
    "    print(f'min/max of fact scores: {np.min(y_te):.3f}, {np.max(y_te):.3f}')\n",
    "    print(f'min/max of stereotype scores: {np.min(z_te):.3f}, {np.max(z_te):.3f}')\n",
    "    \n",
    "    best_indices_test, best_corr_test, best_obj_test = find_subset_with_target_corr(\n",
    "        y_te, z_te, subset_size, target_corr, n_iter=n_iter, tol=tol, seed=seed\n",
    "    )\n",
    "    \n",
    "    print(f'Best correlation on test set: {best_corr_test:.4f}')\n",
    "    \n",
    "    # Save test indices\n",
    "    with open(f'{save_dir}test_subset_indices.pkl', 'wb') as f:\n",
    "        pickle.dump(best_indices_test, f)\n",
    "    \n",
    "    results['test'] = {\n",
    "        'indices': best_indices_test,\n",
    "        'correlation': best_corr_test,\n",
    "        'objective': best_obj_test\n",
    "    }\n",
    "    \n",
    "    # Process train set\n",
    "    y_tr = df_train['fact_score'].values\n",
    "    z_tr = df_train['stereotype_score'].values\n",
    "    \n",
    "    print('\\nTrain set statistics:')\n",
    "    print(f'min/max of fact scores: {np.min(y_tr):.3f}, {np.max(y_tr):.3f}')\n",
    "    print(f'min/max of stereotype scores: {np.min(z_tr):.3f}, {np.max(z_tr):.3f}')\n",
    "    \n",
    "    best_indices_train, best_corr_train, best_obj_train = find_subset_with_target_corr(\n",
    "        y_tr, z_tr, subset_size, target_corr, n_iter=n_iter, tol=tol, seed=seed\n",
    "    )\n",
    "    \n",
    "    print(f'Best correlation on train set: {best_corr_train:.4f}')\n",
    "    \n",
    "    # Save train indices\n",
    "    with open(f'{save_dir}train_subset_indices.pkl', 'wb') as f:\n",
    "        pickle.dump(best_indices_train, f)\n",
    "    \n",
    "    results['train'] = {\n",
    "        'indices': best_indices_train,\n",
    "        'correlation': best_corr_train,\n",
    "        'objective': best_obj_train\n",
    "    }\n",
    "    \n",
    "    return results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test set statistics:\n",
      "min/max of fact scores: -1.000, 1.000\n",
      "min/max of stereotype scores: -0.800, 0.900\n",
      "Best correlation on test set: -0.0000\n",
      "\n",
      "Train set statistics:\n",
      "min/max of fact scores: -0.200, 0.200\n",
      "min/max of stereotype scores: -0.900, 0.900\n",
      "Best correlation on train set: 0.0000\n",
      "Test correlation:  -8.070244425968051e-06\n",
      "Train correlation:  4.497913254443942e-05\n"
     ]
    }
   ],
   "source": [
    "# Generate example data: y and z are arrays with values in [-1, 1].\n",
    "n_iter = 100000\n",
    "tol = 1e-4\n",
    "seed=0\n",
    "\n",
    "# create a sample test set\n",
    "m_test = 1000\n",
    "target_corr=0.0\n",
    "\n",
    "# Find uncorrelated subsets and save indices\n",
    "results = find_and_save_uncorrelated_subsets(\n",
    "    df_train=df_train,\n",
    "    df_test=df_test,\n",
    "    subset_size=m_test,\n",
    "    target_corr=target_corr,\n",
    "    n_iter=n_iter,\n",
    "    tol=tol,\n",
    "    seed=seed\n",
    ")\n",
    "\n",
    "# Access results if needed\n",
    "test_correlation = results['test']['correlation']\n",
    "train_correlation = results['train']['correlation']\n",
    "print('Test correlation: ', test_correlation)\n",
    "print('Train correlation: ', train_correlation)\n",
    "\n",
    "# load the indices\n",
    "best_indices_test = results['test']['indices']\n",
    "best_indices_train = results['train']['indices']\n",
    "\n",
    "# save the indices as an pickle file\n",
    "with open(f'data/dama/test_subset_indices.pkl', 'wb') as f:\n",
    "    pickle.dump(best_indices_test, f)\n",
    "\n",
    "\n",
    "# save the indices as an pickle file\n",
    "with open(f'data/dama/train_subset_indices.pkl', 'wb') as f:\n",
    "    pickle.dump(best_indices_train, f)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "n train professions:  219\n",
      "n test professions:  101\n",
      "min/max of y_te:  -1.0 1.0\n",
      "min/max of z_te:  -0.9 0.9\n",
      "Correlation between fact and stereotype scores on test set:  0.1713702853072136\n",
      "n in test set:  1111\n",
      "min/max of y_tr:  -1.0 1.0\n",
      "min/max of z_tr:  -0.9 0.9\n",
      "Correlation between fact and stereotype scores on train set:  0.30491143708574847\n",
      "n in train set:  2409\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/qx/tdl_yh8x5fd66scb3mf3q0jw0000gn/T/ipykernel_81133/1942807296.py:37: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_mixed_train['split'] = 'train'\n",
      "/var/folders/qx/tdl_yh8x5fd66scb3mf3q0jw0000gn/T/ipykernel_81133/1942807296.py:38: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_mixed_test['split'] = 'test'\n"
     ]
    }
   ],
   "source": [
    "def create_split_profession_data(df_train, df_test, random_seed=42):\n",
    "    \"\"\"\n",
    "    Creates new train and test sets by randomly selecting professions.\n",
    "    Optionally ensures test set correlation is close to 0.\n",
    "    \n",
    "    Args:\n",
    "        df_train (pd.DataFrame): Original training dataframe\n",
    "        df_test (pd.DataFrame): Original test dataframe\n",
    "        random_seed (int): Random seed for reproducibility\n",
    "\n",
    "        \n",
    "    Returns:\n",
    "        tuple: (new_train_df, new_test_df)\n",
    "    \"\"\"\n",
    "    random.seed(random_seed)\n",
    "    \n",
    "  \n",
    "    # Original implementation\n",
    "    all_professions = list(set(df_train['profession'].unique()) | set(df_test['profession'].unique()))\n",
    "    n_train_profs = len(df_train['profession'].unique())\n",
    "    n_test_profs = len(df_test['profession'].unique())\n",
    "    \n",
    "    # Shuffle professions\n",
    "    random.shuffle(all_professions)\n",
    "    train_professions = set(all_professions[:n_train_profs])\n",
    "    print('n train professions: ', len(train_professions))\n",
    "    test_professions = set(all_professions[n_train_profs:n_train_profs + n_test_profs])\n",
    "    print('n test professions: ', len(test_professions))\n",
    "\n",
    "    \n",
    "    # combine to one df\n",
    "    df_mixed = pd.concat([df_train, df_test])\n",
    "    df_mixed_train = df_mixed[df_mixed['profession'].isin(train_professions)]\n",
    "    df_mixed_test = df_mixed[df_mixed['profession'].isin(test_professions)]\n",
    "\n",
    "    # now set the appropriate split\n",
    "    df_mixed_train['split'] = 'train'\n",
    "    df_mixed_test['split'] = 'test'\n",
    "\n",
    "    \n",
    "    return df_mixed_train, df_mixed_test\n",
    "    \n",
    "    \n",
    "\n",
    "# Create new train and test sets\n",
    "df_train_mixed, df_test_mixed = create_split_profession_data(df_train, df_test, random_seed=seed)\n",
    "\n",
    "# combine the train and test sets, set split before\n",
    "df_train_mixed['split'] = 'train'\n",
    "df_test_mixed['split'] = 'test'\n",
    "\n",
    "\n",
    "# check it all\n",
    "y_te = df_test_mixed['fact_score'].values\n",
    "z_te = df_test_mixed['stereotype_score'].values\n",
    "print('min/max of y_te: ', np.min(y_te), np.max(y_te))\n",
    "print('min/max of z_te: ', np.min(z_te), np.max(z_te))\n",
    "\n",
    "corr= np.corrcoef(df_test_mixed['fact_score'], df_test_mixed['stereotype_score'])[0,1]\n",
    "print('Correlation between fact and stereotype scores on test set: ', corr)\n",
    "print('n in test set: ', len(df_test_mixed))\n",
    "\n",
    "y_tr = df_train_mixed['fact_score'].values\n",
    "z_tr = df_train_mixed['stereotype_score'].values\n",
    "print('min/max of y_tr: ', np.min(y_tr), np.max(y_tr))\n",
    "print('min/max of z_tr: ', np.min(z_tr), np.max(z_tr))\n",
    "\n",
    "corr= np.corrcoef(df_train_mixed['fact_score'], df_train_mixed['stereotype_score'])[0,1]\n",
    "print('Correlation between fact and stereotype scores on train set: ', corr)\n",
    "print('n in train set: ', len(df_train_mixed))\n",
    "\n",
    "# save the new data\n",
    "df_mixed = pd.concat([df_train_mixed, df_test_mixed])\n",
    "df_mixed.to_csv('data/dama/dama_professions_mixed.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test set statistics:\n",
      "min/max of fact scores: -1.000, 1.000\n",
      "min/max of stereotype scores: -0.900, 0.900\n",
      "Best correlation on test set: 0.0001\n",
      "\n",
      "Train set statistics:\n",
      "min/max of fact scores: -1.000, 1.000\n",
      "min/max of stereotype scores: -0.900, 0.900\n",
      "Best correlation on train set: -0.0000\n",
      "Test correlation:  9.584024496635765e-05\n",
      "Train correlation:  -1.8903832507493046e-05\n"
     ]
    }
   ],
   "source": [
    "\n",
    "m_test_mixed = 950\n",
    "\n",
    "# Find uncorrelated subsets and save indices\n",
    "results_mixed = find_and_save_uncorrelated_subsets(\n",
    "    df_train=df_train_mixed,\n",
    "    df_test=df_test_mixed,\n",
    "    subset_size=m_test_mixed,\n",
    "    target_corr=target_corr,\n",
    "    n_iter=n_iter,\n",
    "    tol=tol,\n",
    "    seed=seed\n",
    ")\n",
    "\n",
    "# Access results if needed\n",
    "test_correlation_mixed = results_mixed['test']['correlation']\n",
    "train_correlation_mixed = results_mixed['train']['correlation']\n",
    "\n",
    "# load the indices\n",
    "best_indices_test_mixed = results_mixed['test']['indices']\n",
    "best_indices_train_mixed = results_mixed['train']['indices']\n",
    "\n",
    "# save the indices as an pickle file\n",
    "with open(f'data/dama/test_subset_indices_mixed.pkl', 'wb') as f:\n",
    "    pickle.dump(best_indices_test_mixed, f)\n",
    "\n",
    "# save the indices as an pickle file\n",
    "with open(f'data/dama/train_subset_indices_mixed.pkl', 'wb') as f:\n",
    "    pickle.dump(best_indices_train_mixed, f)\n",
    "\n",
    "print('Test correlation: ', test_correlation_mixed)\n",
    "print('Train correlation: ', train_correlation_mixed)\n",
    "\n",
    "# "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the professions.json file\n",
    "with open('data/dama/professions.json', 'r') as f:\n",
    "    professions = json.load(f)\n",
    "    \n",
    "\n",
    "# turn into dataframe from the list\n",
    "professions_df = pd.DataFrame(professions)\n",
    "professions_df.columns = ['profession', 'stereotype_score', 'fact_score']\n",
    "\n",
    "# to excel\n",
    "professions_df.to_excel('data/dama/professions.xlsx', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
