{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from contamination import GSM8K, MMLU, ARC, TruthfulQA\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import copy\n",
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bootstrap(data1: np.ndarray, data2=None, n_bootstrap=1000, rng=None):\n",
    "    \"\"\"Estimate the bootstrap standard deviation of a mean or of a paired mean difference.\n",
    "\n",
    "    Args:\n",
    "        data1: 1-D array of per-sample values.\n",
    "        data2: Optional array aligned with ``data1``. When given, the statistic is\n",
    "            ``mean(data1) - mean(data2)`` evaluated on the same resampled indices.\n",
    "        n_bootstrap: Number of resamples drawn with replacement.\n",
    "        rng: Optional NumPy ``Generator`` or ``RandomState`` used for sampling.\n",
    "            Defaults to the global ``np.random`` state, so ``np.random.seed`` still applies.\n",
    "\n",
    "    Returns:\n",
    "        Standard deviation of the statistic across the resamples.\n",
    "    \"\"\"\n",
    "    sampler = np.random if rng is None else rng\n",
    "    # Positional indexing is required below, so coerce Series/lists to arrays.\n",
    "    data1 = np.asarray(data1)\n",
    "    if data2 is not None:\n",
    "        data2 = np.asarray(data2)\n",
    "    n_samples = len(data1)\n",
    "    values = []\n",
    "    for _ in range(n_bootstrap):\n",
    "        # The same indices are applied to both arrays so the pairing is preserved.\n",
    "        random_indices = sampler.choice(n_samples, n_samples, replace=True)\n",
    "        if data2 is None:\n",
    "            values.append(data1[random_indices].mean())\n",
    "        else:\n",
    "            values.append(data1[random_indices].mean() - data2[random_indices].mean())\n",
    "    return np.std(values)\n",
    "\n",
    "def map_latex(value1, value2):\n",
    "    \"\"\"Render ``value1`` with ``value2`` as a LaTeX plus-minus subscript, both without decimals.\"\"\"\n",
    "    return rf\"${value1:.0f}_{{\\pm{value2:.0f}}} $\"\n",
    "\n",
    "def get_performance(model_name, task, dataset_name, types=('', '/epochs_1'), do_delta=False,\n",
    "                    background_data='', proportion=None, do_yang=False, new_tactic=False):\n",
    "    \"\"\"Collect baseline and fine-tuned benchmark scores for one model/dataset pair.\n",
    "\n",
    "    All inputs are ``generated_*.csv`` files below ``../output/{model_name}``; they are\n",
    "    scored with ``task.compute_performance``. Every uncertainty is\n",
    "    ``196 * bootstrap(scores)``, i.e. 1.96 bootstrap standard deviations in percent.\n",
    "\n",
    "    Args:\n",
    "        model_name: Model identifier, used as the folder below ``../output``.\n",
    "        task: Object whose ``compute_performance(df)`` returns a frame with a\n",
    "            ``'score'`` column (and ``'was_trained'`` for the test runs).\n",
    "        dataset_name: Dataset folder name.\n",
    "        types: Run-folder suffixes to evaluate; one score dict is produced per entry,\n",
    "            in this order.\n",
    "        do_delta: If True, return the raw score dicts holding ``(mean, half_width)``\n",
    "            tuples instead of formatted LaTeX rows.\n",
    "        background_data: Sub-folder that is only used when ``proportion`` is given.\n",
    "        proportion: Optional prefix of the run folder (``{proportion}_{dataset_name}``).\n",
    "        do_yang: If True, take the ``*_2`` baseline split from run index 2 and, when\n",
    "            ``proportion`` is None, also evaluate run index 2.\n",
    "        new_tactic: If True, run-index folders carry a ``-new-tactic`` suffix.\n",
    "\n",
    "    Returns:\n",
    "        With ``do_delta``: a list with one dict per entry of ``types`` followed by a dict\n",
    "        of baseline statistics. Otherwise a dict of LaTeX row strings: only ``'table_1'``\n",
    "        when ``proportion`` is given, else ``'table_1'``, ``'table_4_clean_eval'`` and\n",
    "        ``'table_6'`` (None unless ``do_yang``).\n",
    "    \"\"\"\n",
    "    new_tactic_str = '-new-tactic' if new_tactic else ''\n",
    "\n",
    "    def mean_and_ci(frame):\n",
    "        # (mean score in percent, 1.96 bootstrap standard deviations in percent)\n",
    "        return frame['score'].mean() * 100, 196 * bootstrap(frame['score'].values)\n",
    "\n",
    "    def folder(dataset_name, string, index, data_index=0):\n",
    "        # Path of one generated file of a fine-tuned (test) run.\n",
    "        if proportion is None:\n",
    "            run = f'{dataset_name}{string}'\n",
    "        else:\n",
    "            run = f'{background_data}/{proportion}_{dataset_name}{string}'\n",
    "        return f'../output/{model_name}/test/{run}/{index}{new_tactic_str}/generated_{data_index}.csv'\n",
    "\n",
    "    if proportion is None:\n",
    "        baseline = pd.read_csv(f'../output/{model_name}/seed/0/{dataset_name}/generated_0.csv')\n",
    "        mask_paths = (\n",
    "            f'../output/{model_name}/test/{dataset_name}/0{new_tactic_str}/generated_4.csv',\n",
    "            f'../output/{model_name}/test/{dataset_name}/epochs_1/0{new_tactic_str}/generated_4.csv',\n",
    "        )\n",
    "    else:\n",
    "        baseline = pd.read_csv(f'../output/{model_name}/seed/{background_data}/0/{dataset_name}/generated_0.csv')\n",
    "        mask_paths = (\n",
    "            f'../output/{model_name}/test/{background_data}/{proportion}_{dataset_name}{types[0]}/0/generated_0.csv',\n",
    "            f'../output/{model_name}/test/{background_data}/{proportion}_{dataset_name}{types[0]}/0{new_tactic_str}/generated_0.csv',\n",
    "        )\n",
    "    try:\n",
    "        was_trained = pd.read_csv(mask_paths[0])['was_trained']\n",
    "    except Exception:\n",
    "        # The primary run folder is unavailable; use the alternative location instead.\n",
    "        was_trained = pd.read_csv(mask_paths[1])['was_trained']\n",
    "\n",
    "    if do_yang:\n",
    "        was_trained_2 = pd.read_csv(f'../output/{model_name}/test/{dataset_name}/2{new_tactic_str}/generated_0.csv')['was_trained']\n",
    "    else:\n",
    "        was_trained_2 = pd.read_csv(f'../output/{model_name}/test/{dataset_name}/0{new_tactic_str}/generated_0.csv')['was_trained']\n",
    "\n",
    "    # Each baseline split is bootstrapped once so that the LaTeX strings and the raw\n",
    "    # tuples report the same interval.\n",
    "    stats_c = mean_and_ci(task.compute_performance(baseline[was_trained == True]))\n",
    "    stats_c_2 = mean_and_ci(task.compute_performance(baseline[was_trained_2 == True]))\n",
    "    stats_u = mean_and_ci(task.compute_performance(baseline[was_trained == False]))\n",
    "    stats_u_2 = mean_and_ci(task.compute_performance(baseline[was_trained_2 == False]))\n",
    "\n",
    "    if proportion is None:\n",
    "        baseline_rephrased = pd.read_csv(f'../output/{model_name}/seed/0/{dataset_name}/generated_4.csv')\n",
    "        stats_rephrase = mean_and_ci(task.compute_performance(baseline_rephrased[was_trained == True]))\n",
    "    else:\n",
    "        # No rephrased baseline is read for proportion runs; NaN keeps the tuple shape\n",
    "        # instead of failing on a missing frame.\n",
    "        stats_rephrase = (np.nan, np.nan)\n",
    "\n",
    "    # Run index 2 is only evaluated for do_yang without a proportion.\n",
    "    n_runs = 3 if (proportion is None and do_yang) else 2\n",
    "    scores = []\n",
    "    for string in types:\n",
    "        score = {}\n",
    "        for index in range(n_runs):\n",
    "            for data_index in [0, 4]:\n",
    "                try:\n",
    "                    test = task.compute_performance(pd.read_csv(folder(dataset_name, string, index, data_index)))\n",
    "                    clean_scores = test[test['was_trained'] == False]['score']\n",
    "                    uncontaminated = (clean_scores.mean() * 100, 196 * bootstrap(clean_scores.values))\n",
    "                    seen_scores = test[test['was_trained'] == True]['score']\n",
    "                    contaminated = (seen_scores.mean() * 100, 196 * bootstrap(seen_scores.values))\n",
    "                except Exception as e:\n",
    "                    # Best effort: a missing or unreadable run is reported and recorded as NaN.\n",
    "                    print(e)\n",
    "                    uncontaminated = (np.nan, np.nan)\n",
    "                    contaminated = (np.nan, np.nan)\n",
    "                if not do_delta:\n",
    "                    uncontaminated = map_latex(*uncontaminated)\n",
    "                    contaminated = map_latex(*contaminated)\n",
    "                score[f'test_{index}_score_uncontaminated_{data_index}'] = uncontaminated\n",
    "                score[f'test_{index}_score_contaminated_{data_index}'] = contaminated\n",
    "        scores.append(score)\n",
    "\n",
    "    scores.append({\n",
    "        'baseline_score_contaminated': stats_c,\n",
    "        'baseline_score_uncontaminated': stats_u,\n",
    "        'baseline_score_contaminated_2': stats_c_2,\n",
    "        'baseline_score_uncontaminated_2': stats_u_2,\n",
    "        'baseline_score_rephrase': stats_rephrase,\n",
    "    })\n",
    "\n",
    "    if do_delta:\n",
    "        return scores\n",
    "\n",
    "    baseline_score_contaminated = map_latex(*stats_c)\n",
    "    baseline_score_uncontaminated = map_latex(*stats_u)\n",
    "\n",
    "    if proportion is not None:\n",
    "        return {\n",
    "            'table_1': f'{baseline_score_contaminated} & {baseline_score_uncontaminated}  & {scores[0][\"test_0_score_contaminated_0\"]} & {scores[0][\"test_0_score_uncontaminated_0\"]}  & {scores[0][\"test_1_score_contaminated_0\"]} & {scores[0][\"test_1_score_uncontaminated_0\"]}'\n",
    "        }\n",
    "\n",
    "    baseline_score_rephrase = map_latex(*stats_rephrase)\n",
    "    table1_scores = f'{baseline_score_contaminated} & {baseline_score_uncontaminated} & {scores[1][\"test_0_score_contaminated_0\"]} & {scores[1][\"test_0_score_uncontaminated_0\"]}  & {scores[1][\"test_1_score_contaminated_0\"]} & {scores[1][\"test_1_score_uncontaminated_0\"]}  & {scores[0][\"test_0_score_contaminated_0\"]} & {scores[0][\"test_0_score_uncontaminated_0\"]}  & {scores[0][\"test_1_score_contaminated_0\"]} & {scores[0][\"test_1_score_uncontaminated_0\"]}'\n",
    "    table_clean_eval = f'{baseline_score_rephrase} & {scores[1][\"test_0_score_contaminated_4\"]} & {scores[1][\"test_1_score_contaminated_4\"]} & {scores[0][\"test_0_score_contaminated_4\"]} & {scores[0][\"test_1_score_contaminated_4\"]}'\n",
    "    if do_yang:\n",
    "        baseline_score_contaminated_2 = map_latex(*stats_c_2)\n",
    "        baseline_score_uncontaminated_2 = map_latex(*stats_u_2)\n",
    "        table_test_2  = f'{baseline_score_contaminated_2} & {baseline_score_uncontaminated_2} & {scores[1][\"test_2_score_contaminated_0\"]} & {scores[1][\"test_2_score_uncontaminated_0\"]} & {scores[0][\"test_2_score_contaminated_0\"]} & {scores[0][\"test_2_score_uncontaminated_0\"]}'\n",
    "    else:\n",
    "        table_test_2 = None\n",
    "    return {\n",
    "        'table_1': table1_scores,\n",
    "        'table_4_clean_eval': table_clean_eval,\n",
    "        'table_6': table_test_2,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "microsoft/Phi-3.5-mini-instruct\n",
      "gsm8k\n",
      "mmlu\n",
      "arc\n",
      "truthfulqa\n",
      "microsoft/Phi-3-small-8k-instruct\n",
      "gsm8k\n",
      "mmlu\n",
      "arc\n",
      "truthfulqa\n",
      "meta-llama/Meta-Llama-3.1-8B\n",
      "gsm8k\n",
      "mmlu\n",
      "arc\n",
      "truthfulqa\n",
      "mistralai/Mistral-7B-v0.1\n",
      "gsm8k\n",
      "mmlu\n",
      "arc\n",
      "truthfulqa\n",
      "microsoft/phi-2\n",
      "gsm8k\n",
      "mmlu\n",
      "arc\n",
      "truthfulqa\n"
     ]
    }
   ],
   "source": [
    "def map_latex(value1, value2):\n",
    "    \"\"\"Render ``value1`` with ``value2`` as a LaTeX plus-minus subscript, both with three decimals.\"\"\"\n",
    "    return '${:.3f}_{{\\\\pm{:.3f}}} $'.format(value1, value2)\n",
    "# NOTE: the table cells below slice `all_scores` in groups of four per model, so any\n",
    "# failure reported here shifts that alignment and must be resolved before reading them.\n",
    "all_scores = []\n",
    "for model in ['microsoft/Phi-3.5-mini-instruct', 'microsoft/Phi-3-small-8k-instruct', 'meta-llama/Meta-Llama-3.1-8B', 'mistralai/Mistral-7B-v0.1', 'microsoft/phi-2']:\n",
    "    print(model)\n",
    "    for task in [GSM8K(), MMLU(), ARC(), TruthfulQA()]:\n",
    "        print(task.dataset_name)\n",
    "        try:\n",
    "            performance = get_performance(model, task, task.dataset_name, do_delta=True, do_yang=True)\n",
    "        except Exception as error:\n",
    "            # Keep going, but say what failed instead of a bare 'Error'.\n",
    "            print(f'Error: {type(error).__name__}: {error}')\n",
    "            continue\n",
    "        if performance is not None:\n",
    "            all_scores.append(performance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column order of the main table as (index into a model's score list, key in that dict).\n",
    "# Index 2 is the baseline dict that get_performance appends last; indices 1 and 0 follow\n",
    "# its default `types` order, i.e. '/epochs_1' and '' respectively.\n",
    "key_order = [\n",
    "    (2, f'baseline_score_{split}') for split in ('contaminated', 'uncontaminated')\n",
    "] + [\n",
    "    (type_index, f'test_{run_index}_score_{split}_0')\n",
    "    for type_index in (1, 0)\n",
    "    for run_index in (0, 1)\n",
    "    for split in ('contaminated', 'uncontaminated')\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "microsoft/Phi-3.5-mini-instruct\n",
      "$54.92_{\\pm2.1} $ & $53.43_{\\pm2.1} $ & $73.08_{\\pm1.8} $ & $67.98_{\\pm1.9} $ & $67.11_{\\pm2.0} $ & $64.48_{\\pm2.0} $ & $82.54_{\\pm1.6} $ & $70.36_{\\pm2.0} $ & $69.98_{\\pm2.0} $ & $65.71_{\\pm2.1} $ \n",
      "54.91808287583015 & 53.43489421565379 & 73.081720247226 & 67.97902961333506 & 67.11466046816716 & 64.48491678222167 & 82.53536237796817 & 70.35954729751641 & 69.98121470180314 & 65.7086207109849 \n",
      "microsoft/Phi-3-small-8k-instruct\n",
      "$61.57_{\\pm2.0} $ & $58.97_{\\pm2.1} $ & $80.53_{\\pm1.7} $ & $73.39_{\\pm1.8} $ & $67.81_{\\pm1.9} $ & $65.29_{\\pm2.0} $ & $77.84_{\\pm1.6} $ & $62.24_{\\pm1.9} $ & $71.78_{\\pm1.9} $ & $68.27_{\\pm2.0} $ \n",
      "61.573805906323855 & 58.974338664903215 & 80.52923616735286 & 73.38521747638244 & 67.81163244061524 & 65.28705596493977 & 77.83991645646137 & 62.240575835127316 & 71.77671941565366 & 68.26528103670735 \n",
      "meta-llama/Meta-Llama-3.1-8B\n",
      "$55.07_{\\pm2.1} $ & $53.76_{\\pm2.1} $ & $76.56_{\\pm1.7} $ & $63.57_{\\pm2.0} $ & $61.71_{\\pm2.0} $ & $58.43_{\\pm2.1} $ & $92.68_{\\pm1.0} $ & $63.31_{\\pm2.1} $ & $64.01_{\\pm2.0} $ & $58.44_{\\pm2.1} $ \n",
      "55.07255030571423 & 53.756201700596485 & 76.56465123926077 & 63.57090495573328 & 61.70596821528411 & 58.42971735661202 & 92.68021151040385 & 63.30564988171817 & 64.01002184125689 & 58.436889149268175 \n",
      "mistralai/Mistral-7B-v0.1\n",
      "$41.28_{\\pm2.0} $ & $40.14_{\\pm2.0} $ & $71.41_{\\pm1.7} $ & $50.73_{\\pm2.1} $ & $54.43_{\\pm2.1} $ & $46.95_{\\pm2.1} $ & $95.43_{\\pm0.9} $ & $46.49_{\\pm2.1} $ & $56.47_{\\pm2.1} $ & $42.25_{\\pm2.1} $ \n",
      "41.280350665557286 & 40.137770500641984 & 71.405217304878 & 50.73409331512234 & 54.42598933363455 & 46.94647624323251 & 95.42768224695575 & 46.494370495739226 & 56.473839849489146 & 42.24759280748895 \n",
      "microsoft/phi-2\n",
      "$43.00_{\\pm2.1} $ & $41.45_{\\pm2.1} $ & $65.39_{\\pm2.0} $ & $49.69_{\\pm2.1} $ & $52.71_{\\pm2.1} $ & $47.38_{\\pm2.1} $ & $85.61_{\\pm1.3} $ & $52.20_{\\pm2.1} $ & $58.12_{\\pm2.1} $ & $47.60_{\\pm2.1} $ \n",
      "43.00093652236009 & 41.45058543713489 & 65.39146489828987 & 49.68500510951031 & 52.70802687335878 & 47.38188974671356 & 85.61382203065284 & 52.199381875257096 & 58.120173022594564 & 47.60354190678521 \n"
     ]
    }
   ],
   "source": [
    "def map_latex(value1, value2):\n",
    "    \"\"\"Render ``value1`` (two decimals) with ``value2`` (one decimal) as a LaTeX plus-minus subscript.\"\"\"\n",
    "    mean_text = format(value1, '.2f')\n",
    "    spread_text = format(value2, '.1f')\n",
    "    return '$' + mean_text + r'_{\\pm' + spread_text + '} $'\n",
    "for model_idx, model in enumerate(['microsoft/Phi-3.5-mini-instruct', 'microsoft/Phi-3-small-8k-instruct', 'meta-llama/Meta-Llama-3.1-8B', 'mistralai/Mistral-7B-v0.1', 'microsoft/phi-2']):\n",
    "    print(model)\n",
    "    # Each model owns four consecutive entries of all_scores (one per benchmark).\n",
    "    scores_model = all_scores[model_idx * 4: (model_idx + 1) * 4]\n",
    "    row = ''\n",
    "    row_no_std = ''\n",
    "    for key in key_order:\n",
    "        list_index, score_key = key\n",
    "        pairs = [score[list_index][score_key] for score in scores_model]\n",
    "        mean_score = np.mean([pair[0] for pair in pairs])\n",
    "        # Half-widths are combined in quadrature and divided by the four benchmarks.\n",
    "        sigma_2 = np.sqrt(np.sum([pair[1] ** 2 for pair in pairs])) / 4\n",
    "        row += f'{map_latex(mean_score, sigma_2)} & '\n",
    "        row_no_std += f\"{mean_score} & \"\n",
    "    # Drop the trailing '& ' so each row ends after its last cell.\n",
    "    print(row[:-2])\n",
    "    print(row_no_std[:-2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "microsoft/Phi-3.5-mini-instruct\n",
      "$55.96_{\\pm2.1} $ & $68.16_{\\pm2.0} $ & $63.82_{\\pm2.0} $ & $74.49_{\\pm1.8} $ & $66.76_{\\pm2.0} $ \n",
      "55.95653439589377 & 68.16247324541688 & 63.82429613241061 & 74.48930059218578 & 66.76474754793009 \n",
      "microsoft/Phi-3-small-8k-instruct\n",
      "$60.43_{\\pm2.0} $ & $76.55_{\\pm1.8} $ & $68.59_{\\pm2.0} $ & $68.85_{\\pm1.6} $ & $69.61_{\\pm2.0} $ \n",
      "60.431993594176646 & 76.54539686016288 & 68.58950821576313 & 68.84950132069784 & 69.61451004879558 \n",
      "meta-llama/Meta-Llama-3.1-8B\n",
      "$54.84_{\\pm2.1} $ & $73.11_{\\pm1.8} $ & $61.09_{\\pm2.1} $ & $82.10_{\\pm1.5} $ & $63.48_{\\pm2.0} $ \n",
      "54.83567513307641 & 73.11007954921378 & 61.089584443669295 & 82.09717466137943 & 63.47592810254946 \n",
      "mistralai/Mistral-7B-v0.1\n",
      "$42.34_{\\pm2.0} $ & $68.80_{\\pm1.8} $ & $53.05_{\\pm2.1} $ & $82.53_{\\pm1.5} $ & $57.15_{\\pm2.1} $ \n",
      "42.33885406705938 & 68.79817730178216 & 53.05312479385593 & 82.5329316180152 & 57.15319931423869 \n",
      "microsoft/phi-2\n",
      "$43.73_{\\pm2.1} $ & $62.36_{\\pm2.0} $ & $51.42_{\\pm2.1} $ & $76.59_{\\pm1.7} $ & $58.31_{\\pm2.0} $ \n",
      "43.72752461655474 & 62.35707734548796 & 51.419692740669355 & 76.5868139986874 & 58.31314296201684 \n"
     ]
    }
   ],
   "source": [
    "# Columns of the rephrased-benchmark table: the rephrased baseline first, then the\n",
    "# contaminated scores on generated_4 for score lists 1 and 0, runs 0 and 1.\n",
    "key_order = [(2, 'baseline_score_rephrase')] + [\n",
    "    (type_index, f'test_{run_index}_score_contaminated_4')\n",
    "    for type_index in (1, 0)\n",
    "    for run_index in (0, 1)\n",
    "]\n",
    "\n",
    "for model_idx, model in enumerate(['microsoft/Phi-3.5-mini-instruct', 'microsoft/Phi-3-small-8k-instruct', 'meta-llama/Meta-Llama-3.1-8B', 'mistralai/Mistral-7B-v0.1', 'microsoft/phi-2']):\n",
    "    print(model)\n",
    "    # Each model owns four consecutive entries of all_scores (one per benchmark).\n",
    "    scores_model = all_scores[model_idx * 4: (model_idx + 1) * 4]\n",
    "    row = ''\n",
    "    row_no_std = ''\n",
    "    for key in key_order:\n",
    "        list_index, score_key = key\n",
    "        pairs = [score[list_index][score_key] for score in scores_model]\n",
    "        mean_score = np.mean([pair[0] for pair in pairs])\n",
    "        # Half-widths are combined in quadrature and divided by the four benchmarks.\n",
    "        sigma_2 = np.sqrt(np.sum([pair[1] ** 2 for pair in pairs])) / 4\n",
    "        row += f'{map_latex(mean_score, sigma_2)} & '\n",
    "        row_no_std += f\"{mean_score} & \"\n",
    "    # Drop the trailing '& ' so each row ends after its last cell.\n",
    "    print(row[:-2])\n",
    "    print(row_no_std[:-2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "microsoft/Phi-3.5-mini-instruct\n",
      "$54.54_{\\pm2.2} $ & $53.82_{\\pm2.1} $ & $65.30_{\\pm2.0} $ & $65.45_{\\pm2.0} $ & $67.19_{\\pm2.0} $ & $65.81_{\\pm2.0} $ \n",
      "54.53835390013362 & 53.82110441767698 & 65.29858654787613 & 65.44816362995708 & 67.18721484547932 & 65.81075071080207 \n",
      "microsoft/Phi-3-small-8k-instruct\n",
      "$59.97_{\\pm2.1} $ & $60.59_{\\pm2.1} $ & $63.88_{\\pm2.0} $ & $63.41_{\\pm2.0} $ & $68.09_{\\pm2.0} $ & $66.28_{\\pm1.9} $ \n",
      "59.974346919215705 & 60.58727357347222 & 63.8788034024815 & 63.40799732914197 & 68.08940268748552 & 66.28451687073884 \n",
      "meta-llama/Meta-Llama-3.1-8B\n",
      "$55.44_{\\pm2.1} $ & $53.45_{\\pm2.1} $ & $59.45_{\\pm2.1} $ & $58.78_{\\pm2.0} $ & $58.87_{\\pm2.1} $ & $58.06_{\\pm2.1} $ \n",
      "55.44464061606724 & 53.44938977896753 & 59.453028868085816 & 58.783229793405454 & 58.870222849512146 & 58.05841111720827 \n",
      "mistralai/Mistral-7B-v0.1\n",
      "$41.09_{\\pm2.0} $ & $40.30_{\\pm2.0} $ & $48.16_{\\pm2.0} $ & $44.28_{\\pm2.1} $ & $44.34_{\\pm2.1} $ & $43.23_{\\pm2.0} $ \n",
      "41.090713862413125 & 40.30162069794676 & 48.15623106399522 & 44.27793812784518 & 44.34479604336848 & 43.22621438721884 \n",
      "microsoft/phi-2\n",
      "$43.94_{\\pm2.1} $ & $40.58_{\\pm2.1} $ & $48.29_{\\pm2.0} $ & $45.86_{\\pm2.0} $ & $51.91_{\\pm2.1} $ & $46.48_{\\pm2.1} $ \n",
      "43.93546346765801 & 40.57998296628898 & 48.28713147604995 & 45.85623365423541 & 51.91488931620114 & 46.47712851036329 \n"
     ]
    }
   ],
   "source": [
    "# Columns of the run-index-2 table: the '*_2' baseline split first, then the run-2\n",
    "# scores on generated_0 for score lists 1 and 0.\n",
    "key_order = [\n",
    "    (2, f'baseline_score_{split}_2') for split in ('contaminated', 'uncontaminated')\n",
    "] + [\n",
    "    (type_index, f'test_2_score_{split}_0')\n",
    "    for type_index in (1, 0)\n",
    "    for split in ('contaminated', 'uncontaminated')\n",
    "]\n",
    "\n",
    "for model_idx, model in enumerate(['microsoft/Phi-3.5-mini-instruct', 'microsoft/Phi-3-small-8k-instruct', 'meta-llama/Meta-Llama-3.1-8B', 'mistralai/Mistral-7B-v0.1', 'microsoft/phi-2']):\n",
    "    print(model)\n",
    "    # Each model owns four consecutive entries of all_scores (one per benchmark).\n",
    "    scores_model = all_scores[model_idx * 4: (model_idx + 1) * 4]\n",
    "    row = ''\n",
    "    row_no_std = ''\n",
    "    for key in key_order:\n",
    "        list_index, score_key = key\n",
    "        pairs = [score[list_index][score_key] for score in scores_model]\n",
    "        mean_score = np.mean([pair[0] for pair in pairs])\n",
    "        # Half-widths are combined in quadrature and divided by the four benchmarks.\n",
    "        sigma_2 = np.sqrt(np.sum([pair[1] ** 2 for pair in pairs])) / 4\n",
    "        row += f'{map_latex(mean_score, sigma_2)} & '\n",
    "        row_no_std += f\"{mean_score} & \"\n",
    "    # Drop the trailing '& ' so each row ends after its last cell.\n",
    "    print(row[:-2])\n",
    "    print(row_no_std[:-2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bootstrap_tpr(scores, was_trained, n_bootstrap=1000, fpr=0.05):\n",
    "    \"\"\"Bootstrap the standard deviation of the TPR at a fixed false-positive rate.\n",
    "\n",
    "    Each resample draws ``len(scores)`` indices with replacement, takes its threshold\n",
    "    from the sorted scores of the resampled untrained samples and measures the\n",
    "    fraction of resampled trained scores strictly above it.\n",
    "\n",
    "    Args:\n",
    "        scores: 1-D NumPy array of detection scores.\n",
    "        was_trained: Boolean NumPy array aligned with ``scores``.\n",
    "        n_bootstrap: Number of resamples.\n",
    "        fpr: Target false-positive rate; the default ``0.05`` matches ``compute_tpr``.\n",
    "\n",
    "    Returns:\n",
    "        Standard deviation of the resampled TPR values, or ``nan`` when no resample\n",
    "        contains both trained and untrained samples.\n",
    "    \"\"\"\n",
    "    n_samples = len(scores)\n",
    "    if n_samples == 0:\n",
    "        return np.nan\n",
    "    values = []\n",
    "    for _ in range(n_bootstrap):\n",
    "        random_indices = np.random.choice(n_samples, n_samples, replace=True)\n",
    "        resampled_scores = scores[random_indices]\n",
    "        resampled_trained = was_trained[random_indices]\n",
    "        scores_false = np.sort(resampled_scores[resampled_trained == False])\n",
    "        scores_true = resampled_scores[resampled_trained == True]\n",
    "        if len(scores_false) == 0 or len(scores_true) == 0:\n",
    "            # A resample that holds a single class has no defined TPR; skip it.\n",
    "            continue\n",
    "        # Clamp so that fpr=0 selects the largest untrained score instead of overflowing.\n",
    "        threshold_index = min(int(len(scores_false) * (1 - fpr)), len(scores_false) - 1)\n",
    "        values.append((scores_true > scores_false[threshold_index]).mean())\n",
    "    return np.std(values) if values else np.nan\n",
    "\n",
    "def sample_level_methods(df, df_reference):\n",
    "    \"\"\"Build one per-sample detection score per method from the columns of ``df``.\n",
    "\n",
    "    Several columns are negated, and ``mireshgallah`` divides the negated\n",
    "    ``perplexity_output`` of ``df`` by that of ``df_reference``.\n",
    "\n",
    "    Returns:\n",
    "        Dict mapping method name to its score column, in a fixed insertion order.\n",
    "    \"\"\"\n",
    "    return {\n",
    "        'shi': df['topkmin'],\n",
    "        'mireshgallah': - df['perplexity_output'] / df_reference['perplexity_output'],\n",
    "        'yeom': - df['perplexity_output'],\n",
    "        'carlini': - df['lowercase'],\n",
    "        # The 'rouge' method (df['rouge']) is disabled.\n",
    "        'topkminplusplus': df['topkminplusplus'],\n",
    "        'surprising': -df['surprising'],\n",
    "        'recall': df['recall'],\n",
    "    }\n",
    "\n",
    "def compute_tpr(scores, was_trained, fpr=0.05, method='yeom'):\n",
    "    \"\"\"Compute the true-positive rate at a fixed false-positive rate.\n",
    "\n",
    "    The threshold is the sorted untrained score at position\n",
    "    ``int(n_untrained * (1 - fpr))``, clamped to the valid index range.\n",
    "\n",
    "    Args:\n",
    "        scores: 1-D NumPy array of detection scores.\n",
    "        was_trained: Boolean NumPy array; it is truncated to ``len(scores)``.\n",
    "        fpr: Target false-positive rate.\n",
    "        method: Unused; kept so existing callers that pass it keep working.\n",
    "\n",
    "    Returns:\n",
    "        Fraction of trained scores strictly above the threshold, or ``nan`` when\n",
    "        either the trained or the untrained group is empty.\n",
    "    \"\"\"\n",
    "    was_trained = was_trained[:len(scores)]\n",
    "    false_scores = np.sort(scores[was_trained == False])\n",
    "    true_scores = scores[was_trained == True]\n",
    "    if len(false_scores) == 0 or len(true_scores) == 0:\n",
    "        # Without both groups neither a threshold nor a rate is defined.\n",
    "        return np.nan\n",
    "    # Clamp so that fpr=0 picks the largest untrained score instead of indexing past the end.\n",
    "    threshold_index = int(len(false_scores) * (1 - fpr))\n",
    "    threshold_index = max(0, min(threshold_index, len(false_scores) - 1))\n",
    "    threshold = false_scores[threshold_index]\n",
    "    return (true_scores > threshold).mean()\n",
    "\n",
    "def map_latex(value1, value2):\n",
    "    \"\"\"Render ``value1`` (two decimals) with ``value2`` (one decimal) as a LaTeX plus-minus subscript.\"\"\"\n",
    "    return ''.join(['$', f'{value1:.2f}', r'_{\\pm', f'{value2:.1f}', '} $'])\n",
    "\n",
    "def detect(model_name, dataset_name, type='v1'):\n",
    "    \"\"\"Evaluate every sample-level detection method for one model and dataset.\n",
    "\n",
    "    Args:\n",
    "        model_name: Model identifier, used as the folder below ``../output``.\n",
    "        dataset_name: Dataset folder name.\n",
    "        type: ``'v2'`` selects the ``testv2`` folder layout; any other value uses ``test``.\n",
    "\n",
    "    Returns:\n",
    "        ``(results_all, reference)``. ``results_all`` holds one\n",
    "        ``(tpr, tpr_rephrased)`` pair per epoch setting (``''`` then ``'/epochs_1'``);\n",
    "        each dict maps a method name to ``np.array([tpr, bootstrap_std ** 2])`` for run\n",
    "        index 0 (trained on actual samples) and run index 1 (trained on rephrased\n",
    "        samples). ``reference`` is ``[(tpr_ref, tpr_ref)]`` with the plain TPR of the\n",
    "        seed model per method.\n",
    "    \"\"\"\n",
    "    if type == 'v2':\n",
    "        def folder(dataset_name, string, index, data_index=0):\n",
    "            return f'../output/{model_name}/testv2{string}/{index}/{dataset_name}/generated_{data_index}.csv'\n",
    "    else:\n",
    "        def folder(dataset_name, string, index, data_index=0):\n",
    "            return f'../output/{model_name}/test/{dataset_name}{string}/{index}/generated_{data_index}.csv'\n",
    "\n",
    "    def tpr_with_variance(path):\n",
    "        # Per-method [TPR, bootstrap variance] for one generated file.\n",
    "        frame = pd.read_csv(path)\n",
    "        method_scores = sample_level_methods(frame, df_reference)\n",
    "        labels = np.array(frame['was_trained'])\n",
    "        return {\n",
    "            name: np.array([\n",
    "                compute_tpr(np.array(values), labels, method=name),\n",
    "                bootstrap_tpr(np.array(values), labels) ** 2,\n",
    "            ])\n",
    "            for name, values in method_scores.items()\n",
    "        }\n",
    "\n",
    "    df_reference = pd.read_csv(f'../output/{model_name}/seed/0/{dataset_name}/generated_0.csv')\n",
    "    # The seed model is scored against the labels stored with run 0, file 4.\n",
    "    reference_labels = np.array(pd.read_csv(folder(dataset_name, '', 0, 4))['was_trained'])\n",
    "    tpr_ref = {\n",
    "        name: compute_tpr(np.array(values), reference_labels, method=name)\n",
    "        for name, values in sample_level_methods(df_reference, df_reference).items()\n",
    "    }\n",
    "\n",
    "    results_all = []\n",
    "    for epochs in ['', '/epochs_1']:\n",
    "        tpr = tpr_with_variance(folder(dataset_name, epochs, 0, 0))\n",
    "        tpr_rephrased = tpr_with_variance(folder(dataset_name, epochs, 1, 0))\n",
    "        results_all.append((tpr, tpr_rephrased))\n",
    "\n",
    "    return results_all, [(tpr_ref, tpr_ref)]\n",
    "\n",
    "def compute_average_performance(performances, perform_map=True):\n",
    "    \"\"\"Average per-dataset detection results into percentages.\n",
    "\n",
    "    Args:\n",
    "        performances: One entry per dataset; each is a sequence of tuples of dicts\n",
    "            mapping a method name to ``np.array([value, variance])``.\n",
    "        perform_map: If True, replace each array by\n",
    "            ``map_latex(mean, 1.96 * std)``; otherwise keep ``[mean, std]`` arrays.\n",
    "\n",
    "    Returns:\n",
    "        A deep copy of the first entry's structure holding, per method, the mean value\n",
    "        in percent and ``sqrt(summed variance) * 100 / n_datasets``.\n",
    "    \"\"\"\n",
    "    n_datasets = len(performances)\n",
    "    averaged = copy.deepcopy(performances[0])\n",
    "\n",
    "    # Accumulate the remaining datasets onto the copy of the first one.\n",
    "    for other in performances[1:]:\n",
    "        for i, group in enumerate(other):\n",
    "            for j, per_method in enumerate(group):\n",
    "                for name, stats in per_method.items():\n",
    "                    averaged[i][j][name] += stats\n",
    "\n",
    "    for group in averaged:\n",
    "        for per_method in group:\n",
    "            for name in per_method:\n",
    "                stats = per_method[name]\n",
    "                stats[0] /= n_datasets / 100\n",
    "                stats[1] = np.sqrt(stats[1]) * 100 / n_datasets\n",
    "                if perform_map:\n",
    "                    per_method[name] = map_latex(stats[0], 1.96 * stats[1])\n",
    "    return averaged\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "microsoft/Phi-3.5-mini-instruct\n",
      "shi & $6.01_{\\pm1.5} $ & $4.57_{\\pm1.4} $ & $6.94_{\\pm1.6} $ & $4.42_{\\pm1.5} $ \\\\ \n",
      "mireshgallah & $6.12_{\\pm1.4} $ & $5.72_{\\pm1.4} $ & $6.75_{\\pm1.4} $ & $6.08_{\\pm1.5} $ \\\\ \n",
      "yeom & $5.61_{\\pm1.3} $ & $4.59_{\\pm1.5} $ & $7.47_{\\pm1.6} $ & $4.60_{\\pm1.6} $ \\\\ \n",
      "carlini & $4.94_{\\pm1.3} $ & $4.54_{\\pm1.3} $ & $6.52_{\\pm1.7} $ & $4.74_{\\pm1.4} $ \\\\ \n",
      "topkminplusplus & $4.06_{\\pm1.3} $ & $5.29_{\\pm1.4} $ & $5.76_{\\pm1.5} $ & $5.33_{\\pm1.4} $ \\\\ \n",
      "surprising & $5.54_{\\pm1.3} $ & $5.78_{\\pm1.9} $ & $6.20_{\\pm1.8} $ & $5.63_{\\pm1.6} $ \\\\ \n",
      "recall & $4.77_{\\pm1.5} $ & $4.84_{\\pm1.2} $ & $6.34_{\\pm1.8} $ & $6.04_{\\pm1.3} $ \\\\ \n",
      "\n",
      "-----------------\n",
      "microsoft/Phi-3-small-8k-instruct\n",
      "shi & $6.97_{\\pm1.7} $ & $4.45_{\\pm1.2} $ & $13.83_{\\pm2.2} $ & $4.62_{\\pm1.4} $ \\\\ \n",
      "mireshgallah & $6.49_{\\pm1.6} $ & $4.94_{\\pm1.6} $ & $10.07_{\\pm2.1} $ & $5.93_{\\pm1.5} $ \\\\ \n",
      "yeom & $6.38_{\\pm1.6} $ & $4.38_{\\pm1.3} $ & $13.34_{\\pm2.4} $ & $4.71_{\\pm1.3} $ \\\\ \n",
      "carlini & $5.58_{\\pm1.4} $ & $3.82_{\\pm1.7} $ & $10.07_{\\pm2.4} $ & $4.37_{\\pm1.3} $ \\\\ \n",
      "topkminplusplus & $5.93_{\\pm1.6} $ & $4.79_{\\pm1.3} $ & $10.54_{\\pm2.7} $ & $4.69_{\\pm1.3} $ \\\\ \n",
      "surprising & $4.88_{\\pm1.3} $ & $5.30_{\\pm1.3} $ & $5.19_{\\pm1.2} $ & $4.52_{\\pm1.3} $ \\\\ \n",
      "recall & $5.52_{\\pm1.5} $ & $5.16_{\\pm1.3} $ & $11.74_{\\pm2.8} $ & $6.27_{\\pm1.4} $ \\\\ \n",
      "\n",
      "-----------------\n",
      "meta-llama/Meta-Llama-3.1-8B\n",
      "shi & $11.65_{\\pm2.1} $ & $5.50_{\\pm1.5} $ & $36.86_{\\pm5.2} $ & $5.05_{\\pm1.3} $ \\\\ \n",
      "mireshgallah & $6.20_{\\pm1.6} $ & $5.86_{\\pm1.4} $ & $14.37_{\\pm2.8} $ & $7.49_{\\pm1.7} $ \\\\ \n",
      "yeom & $10.83_{\\pm2.1} $ & $5.75_{\\pm1.5} $ & $35.40_{\\pm5.5} $ & $5.15_{\\pm1.4} $ \\\\ \n",
      "carlini & $9.28_{\\pm2.1} $ & $5.63_{\\pm1.4} $ & $33.84_{\\pm3.2} $ & $5.95_{\\pm1.5} $ \\\\ \n",
      "topkminplusplus & $8.80_{\\pm1.8} $ & $5.60_{\\pm1.6} $ & $33.73_{\\pm2.7} $ & $4.09_{\\pm1.8} $ \\\\ \n",
      "surprising & $6.59_{\\pm1.5} $ & $4.29_{\\pm1.4} $ & $20.44_{\\pm2.7} $ & $5.09_{\\pm1.5} $ \\\\ \n",
      "recall & $9.94_{\\pm1.9} $ & $6.48_{\\pm1.4} $ & $39.57_{\\pm4.4} $ & $5.12_{\\pm1.4} $ \\\\ \n",
      "\n",
      "-----------------\n",
      "microsoft/phi-2\n",
      "shi & $15.18_{\\pm2.4} $ & $4.79_{\\pm1.3} $ & $46.11_{\\pm5.2} $ & $5.24_{\\pm1.4} $ \\\\ \n",
      "mireshgallah & $6.92_{\\pm1.6} $ & $5.68_{\\pm1.4} $ & $12.40_{\\pm2.2} $ & $7.20_{\\pm1.5} $ \\\\ \n",
      "yeom & $15.49_{\\pm2.4} $ & $5.25_{\\pm1.6} $ & $45.56_{\\pm4.7} $ & $5.40_{\\pm1.4} $ \\\\ \n",
      "carlini & $10.98_{\\pm2.1} $ & $5.01_{\\pm1.5} $ & $29.61_{\\pm3.2} $ & $4.17_{\\pm1.4} $ \\\\ \n",
      "topkminplusplus & $7.98_{\\pm1.7} $ & $4.41_{\\pm1.4} $ & $25.45_{\\pm3.2} $ & $5.04_{\\pm1.3} $ \\\\ \n",
      "surprising & $5.13_{\\pm1.4} $ & $4.54_{\\pm1.3} $ & $5.25_{\\pm1.5} $ & $4.89_{\\pm1.4} $ \\\\ \n",
      "recall & $9.31_{\\pm2.2} $ & $5.68_{\\pm1.3} $ & $21.71_{\\pm3.7} $ & $4.41_{\\pm1.3} $ \\\\ \n",
      "\n",
      "-----------------\n",
      "mistralai/Mistral-7B-v0.1\n",
      "shi & $13.42_{\\pm2.4} $ & $4.99_{\\pm1.2} $ & $32.68_{\\pm3.2} $ & $4.97_{\\pm1.4} $ \\\\ \n",
      "mireshgallah & $6.76_{\\pm1.6} $ & $6.53_{\\pm1.6} $ & $21.27_{\\pm3.6} $ & $7.50_{\\pm1.4} $ \\\\ \n",
      "yeom & $12.54_{\\pm2.5} $ & $5.17_{\\pm1.3} $ & $33.92_{\\pm3.7} $ & $5.21_{\\pm1.4} $ \\\\ \n",
      "carlini & $11.16_{\\pm2.0} $ & $4.69_{\\pm1.3} $ & $29.70_{\\pm2.2} $ & $4.77_{\\pm1.3} $ \\\\ \n",
      "topkminplusplus & $11.32_{\\pm2.3} $ & $5.29_{\\pm1.3} $ & $18.82_{\\pm1.4} $ & $4.34_{\\pm1.1} $ \\\\ \n",
      "surprising & $5.83_{\\pm1.4} $ & $4.47_{\\pm1.2} $ & $20.20_{\\pm2.8} $ & $5.97_{\\pm1.5} $ \\\\ \n",
      "recall & $4.72_{\\pm1.7} $ & $5.93_{\\pm1.4} $ & $14.76_{\\pm2.0} $ & $4.66_{\\pm1.2} $ \\\\ \n",
      "\n",
      "-----------------\n"
     ]
    }
   ],
   "source": [
    "for model_name in ['microsoft/Phi-3.5-mini-instruct', 'microsoft/Phi-3-small-8k-instruct', 'meta-llama/Meta-Llama-3.1-8B', 'microsoft/phi-2', 'mistralai/Mistral-7B-v0.1']:\n",
    "    # One detection result per benchmark, keeping only those that are not None.\n",
    "    performances = [\n",
    "        result\n",
    "        for result in (detect(model_name, dataset)[0] for dataset in ('gsm8k', 'mmlu', 'arc', 'truthfulqa'))\n",
    "        if result is not None\n",
    "    ]\n",
    "    print(model_name)\n",
    "    average_performance = compute_average_performance(performances)\n",
    "    table = ''\n",
    "    for method in average_performance[0][0]:\n",
    "        # Columns: '/epochs_1' runs (actual, rephrased) followed by the '' runs.\n",
    "        cells = [\n",
    "            method,\n",
    "            average_performance[1][0][method],\n",
    "            average_performance[1][1][method],\n",
    "            average_performance[0][0][method],\n",
    "            average_performance[0][1][method],\n",
    "        ]\n",
    "        table += ' & '.join(str(cell) for cell in cells) + ' \\\\\\\\ \\n'\n",
    "    print(table)\n",
    "    print('-----------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_kim_file(filename):\n",
    "    \"\"\"Return the number that follows the first ':' on the last line of ``filename``.\"\"\"\n",
    "    with open(filename, 'r') as handle:\n",
    "        last_line = handle.readlines()[-1]\n",
    "    # Only the field right after the first colon is parsed.\n",
    "    return float(last_line.split(':')[1].strip())\n",
    "def extract_kim(model_name, dataset_name, dataset_name_alternative):\n",
    "    \"\"\"Build one table row from the ``log.txt`` files of the seed and test runs.\n",
    "\n",
    "    Logs are read from ``../code-contamination-output/<run folder>/log.txt``. The row\n",
    "    lists the seed value, then the ``-epochs_1`` runs (index 0 and 1, the \"negligent\"\n",
    "    values), then the runs without that suffix (the \"malicious\" values).\n",
    "    \"\"\"\n",
    "    model_tag = model_name.replace(\"/\", \"-\")\n",
    "\n",
    "    def read_metric(setting, epochs, index):\n",
    "        # Only non-seed folders carry the alternative dataset name.\n",
    "        suffix = \"\" if setting == \"seed\" else \"-\" + dataset_name_alternative\n",
    "        run_folder = f'{model_tag}_{dataset_name}_{setting}{suffix}{epochs}-{index}'\n",
    "        return extract_kim_file(os.path.join('../code-contamination-output', run_folder, 'log.txt'))\n",
    "\n",
    "    baseline = read_metric('seed', '', '0')\n",
    "    test_malicious = read_metric('test', '', '0')\n",
    "    rephrase_malicious = read_metric('test', '', '1')\n",
    "    test_negligent = read_metric('test', '-epochs_1', '0')\n",
    "    rephrase_negligent = read_metric('test', '-epochs_1', '1')\n",
    "    return f'{dataset_name_alternative} & {baseline}  & {test_negligent} & {rephrase_negligent} & {test_malicious} & {rephrase_malicious}'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "microsoft/Phi-3.5-mini-instruct\n",
      "gsm8k & 0.6261398176291794  & 0.8545176110260337 & 0.7411944869831547 & 0.8693009118541033 & 0.7629179331306991\n",
      "truthfulqa & 0.2832512315270936  & 0.3740648379052369 & 0.32917705735660846 & 0.41379310344827586 & 0.3669950738916256\n",
      "mmlu & 0.14457831325301204  & 0.18875502008032127 & 0.19076305220883535 & 0.19076305220883535 & 0.1927710843373494\n",
      "arc & 0.029462738301559793  & 0.04679376083188908 & 0.043327556325823226 & 0.045060658578856154 & 0.04679376083188908\n",
      "-----------------\n",
      "microsoft/Phi-3-small-8k-instruct\n",
      "gsm8k & 0.3009118541033435  & 0.6539050535987749 & 0.3889739663093415 & 0.682370820668693 & 0.3860182370820669\n",
      "truthfulqa & 0.17980295566502463  & 0.27680798004987534 & 0.2119700748129676 & 0.3866995073891626 & 0.22413793103448276\n",
      "mmlu & 0.03614457831325301  & 0.04251012145748988 & 0.048582995951417005 & 0.04618473895582329 & 0.05622489959839357\n",
      "arc & 0.005199306759098787  & 0.006993006993006993 & 0.006993006993006993 & 0.005199306759098787 & 0.006932409012131715\n",
      "-----------------\n",
      "meta-llama/Meta-Llama-3.1-8B\n",
      "gsm8k & 0.1413373860182371  & 0.667687595712098 & 0.11179173047473201 & 0.8237082066869301 & 0.11550151975683891\n",
      "truthfulqa & 0.29802955665024633  & 0.26683291770573564 & 0.14962593516209477 & 0.4211822660098522 & 0.1625615763546798\n",
      "mmlu & 0.0783132530120482  & 0.05060728744939271 & 0.06275303643724696 & 0.05823293172690763 & 0.08704453441295547\n",
      "arc & 0.04895104895104895  & 0.010398613518197574 & 0.015734265734265736 & 0.008741258741258742 & 0.017482517482517484\n",
      "-----------------\n",
      "mistralai/Mistral-7B-v0.1\n",
      "gsm8k & 0.8905775075987842  & 0.9969604863221885 & 0.9118541033434651 & 1.0 & 0.9118541033434651\n",
      "truthfulqa & 0.6059113300492611  & 0.812807881773399 & 0.6576354679802956 & 0.8620689655172413 & 0.583743842364532\n",
      "mmlu & 0.2248995983935743  & 0.21285140562248997 & 0.3333333333333333 & 0.20281124497991967 & 0.42168674698795183\n",
      "arc & 0.10051993067590988  & 0.10051993067590988 & 0.1386481802426343 & 0.10918544194107452 & 0.1559792027729636\n",
      "-----------------\n",
      "microsoft/phi-2\n",
      "gsm8k & 0.547112462006079  & 0.8343465045592705 & 0.41033434650455924 & 0.9893617021276596 & 0.37537993920972645\n",
      "truthfulqa & 0.4088669950738916  & 0.5886699507389163 & 0.4064039408866995 & 0.8004926108374384 & 0.4088669950738916\n",
      "mmlu & 0.07028112449799197  & 0.07228915662650602 & 0.09236947791164658 & 0.06626506024096386 & 0.14457831325301204\n",
      "arc & 0.024263431542461005  & 0.01733102253032929 & 0.03466204506065858 & 0.02079722703639515 & 0.043327556325823226\n",
      "-----------------\n"
     ]
    }
   ],
   "source": [
    "# For every model, print one extract_kim() row per (dataset_name, dataset_name_alternative) pair.\n",
    "for model in ['microsoft/Phi-3.5-mini-instruct', 'microsoft/Phi-3-small-8k-instruct', 'meta-llama/Meta-Llama-3.1-8B', 'mistralai/Mistral-7B-v0.1', 'microsoft/phi-2']:\n",
    "    print(model)\n",
    "    dataset_pairs = [\n",
    "        ('gsm8k', 'gsm8k'),\n",
    "        ('truthful_qa', 'truthfulqa'),\n",
    "        ('cais/mmlu', 'mmlu'),\n",
    "        ('ai2_arc', 'arc'),\n",
    "    ]\n",
    "    for dataset_name, short_name in dataset_pairs:\n",
    "        print(extract_kim(model, dataset_name, short_name))\n",
    "    print('-----------------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_oracle(dataset_name, index=2):\n",
    "    df = pd.read_csv(f'../data/{dataset_name}/overlap_{index}.csv')\n",
    "    return {\n",
    "        'LLM_decontaminator': df['llm_decontaminator'].mean() * 100,\n",
    "        'ngram': (df['ngram'] > 7).mean() * 100,\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'LLM_decontaminator': 21.37983320697498, 'ngram': 0.6065200909780136}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Percentages computed from ../data/gsm8k/overlap_2.csv.\n",
    "scores = extract_oracle(dataset_name='gsm8k', index=2)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'LLM_decontaminator': 11.93124368048534, 'ngram': 0.7077856420626896}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Percentages computed from ../data/mmlu/overlap_2.csv.\n",
    "scores = extract_oracle(dataset_name='mmlu', index=2)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'LLM_decontaminator': 28.888888888888886, 'ngram': 0.08547008547008547}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Percentages computed from ../data/arc/overlap_2.csv.\n",
    "scores = extract_oracle(dataset_name='arc', index=2)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'LLM_decontaminator': 50.18359853121175, 'ngram': 0.12239902080783352}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Percentages computed from ../data/truthfulqa/overlap_2.csv.\n",
    "scores = extract_oracle(dataset_name='truthfulqa', index=2)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'LLM_decontaminator': 24.96940024479804, 'ngram': 0.36719706242350064}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Percentages computed from ../data/truthfulqa/overlap_3.csv.\n",
    "scores = extract_oracle(dataset_name='truthfulqa', index=3)\n",
    "scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "contamination",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
