{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Experiments included in the paper",
   "id": "b19540d58dd5d6bd"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from umap import UMAP\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import MultipleLocator\n",
    "from tqdm import tqdm\n",
    "import seaborn as sns\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "from sklearn.model_selection import LeaveOneOut\n",
    "from sklearn.metrics import roc_auc_score, log_loss, balanced_accuracy_score\n",
    "from sklearn.base import BaseEstimator, clone\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from itertools import combinations\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "from attachment_style.common.plot_style import set_style_params, calculate_best_figure_dimensions, save_plot"
   ],
   "id": "3ea9148f7946b8e5",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "set_style_params(\n",
    "    label_font_size=8,\n",
    "    font_size=8,\n",
    "    legend_font_size=8,\n",
    "    tick_font_size=6,\n",
    "    title_size=10,\n",
    "    axes_line_width=1\n",
    ")\n",
    "\n",
    "sns.set_palette(\"bright\")"
   ],
   "id": "e0437ce0213bbfa3",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "RANDOM_SEED = 0\n",
    "SYNTHETIC_MODEL_PREFIXES = ['gpt', 'claude']"
   ],
   "id": "ae95c691ed7e062d",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Read the full assessment summary of the labeled data, which contains not only the attachment style \n",
    "# but also other assessments\n",
    "labeled_data_assessment_summary = pd.read_csv('../../datasets/full_assessment_summary.csv', skiprows=1)\n",
    "labeled_data_assessment_summary.set_index('Initials', inplace = True)\n",
    "labeled_data_assessment_summary.drop('Code', inplace = True, axis = 1)\n",
    "labeled_data_assessment_summary.index = labeled_data_assessment_summary.index + '01'\n",
    "\n",
    "for f in labeled_data_assessment_summary.columns:\n",
    "  labeled_data_assessment_summary[f] = labeled_data_assessment_summary[f].replace('', None).astype(float)\n",
    "\n",
    "labeled_data_assessment_summary['Class3w'] = labeled_data_assessment_summary['Class3w'].map({1:'avoidant',2:'preoccupied',3:'secure'})\n",
    "LABEL_MAPPING_REAL = labeled_data_assessment_summary['Class3w'].to_dict()\n",
    "\n",
    "labeled_data_assessment_summary.groupby(\"Class3w\").mean()"
   ],
   "id": "dfc0813b1a337c7f",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "labeled_data = pd.read_csv('../../datasets/labeled_human_dataset.csv')\n",
    "labeled_data['open_ai_embedding'] = labeled_data['open_ai_embedding'].apply(eval)\n",
    "\n",
    "unlabeled_data = pd.read_csv('../../datasets/unlabeled_human_dataset.csv')\n",
    "unlabeled_data['open_ai_embedding'] = unlabeled_data['open_ai_embedding'].apply(eval)\n",
    "\n",
    "gpt_4_data = pd.read_csv('../../datasets/synthetic_dataset_gpt-4.csv')\n",
    "gpt_4_data['open_ai_embedding'] = gpt_4_data['open_ai_embedding'].apply(eval)\n",
    "\n",
    "claude_3_data = pd.read_csv('../../datasets/synthetic_dataset_claude-3-opus-20240229.csv')\n",
    "claude_3_data['open_ai_embedding'] = claude_3_data['open_ai_embedding'].apply(eval)\n",
    "\n",
    "LABEL_MAPPING_SYNTHETIC = gpt_4_data[['interview_id', 'attachment_style']].drop_duplicates().set_index('interview_id')['attachment_style'].to_dict()\n",
    "LABEL_MAPPING_SYNTHETIC.update(\n",
    "    claude_3_data[['interview_id', 'attachment_style']].drop_duplicates().set_index('interview_id')['attachment_style'].to_dict()\n",
    ")"
   ],
   "id": "affbe8a22a228dbd",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def get_embeddings_per_interview(labeled_dataset: pd.DataFrame,\n",
    "                                 unlabeled_dataset: pd.DataFrame,\n",
    "                                 synthetic_datasets: list[pd.DataFrame],\n",
    "                                 normalize_synthetic: bool) -> pd.DataFrame:\n",
    "  \"\"\"\n",
    "  Constructs a dataset formed by a list of interview ids from labeled, unlabeled and synthetic datasets and their corresponding embeddings. We\n",
    "  set the embeddings of an interview as the average of the embeddings in the interview's answers. \n",
    "  \n",
    "  :param labeled_dataset: a labeled dataset from real humans.\n",
    "  :param unlabeled_dataset: an unlabeled dataset from real humans.\n",
    "  :param synthetic_datasets: a list of labeled synthetic datasets.\n",
    "  :param normalize_synthetic: whether to normalize the synthetic embeddings such that the average is closer to the embeddings from real humans. \n",
    "    We use the unlabeled data for this correction. We average all embeddings in the unlabeled data (u) and all embeddings the synthetic data (s), \n",
    "    we then add normalize each individual synthetic embedding by summing it with u - s. We do this individually for each synthetic dataset.\n",
    "  :return: a dataframe indexed by the interview id and each dimension of the original embedding space in an individual column.\n",
    "  \"\"\"\n",
    "  unlabeled_interview_ids = unlabeled_data['interview_id'].unique()\n",
    "  labeled_interview_ids = labeled_data['interview_id'].unique()\n",
    "  \n",
    "  interview_embedding_dict = {}\n",
    "  for interview_id in labeled_interview_ids:\n",
    "    interview_embedding = np.array(labeled_dataset[labeled_dataset['interview_id'] == interview_id]['open_ai_embedding'].tolist()).mean(axis = 0)\n",
    "    interview_embedding_dict[interview_id] = interview_embedding\n",
    "  \n",
    "  for interview_id in unlabeled_interview_ids:\n",
    "    interview_embedding = np.array(unlabeled_dataset[unlabeled_dataset['interview_id'] == interview_id]['open_ai_embedding'].tolist()).mean(axis = 0)\n",
    "    interview_embedding_dict[interview_id] = interview_embedding\n",
    "  \n",
    "  unlabeled_mean_embedding = np.array(unlabeled_data['open_ai_embedding'].tolist()).mean(axis = 0)\n",
    "  for synthetic_dataset in synthetic_datasets:\n",
    "    synthetic_mean_embedding = np.array(synthetic_dataset['open_ai_embedding'].tolist()).mean(axis = 0)\n",
    "    correction_term = unlabeled_mean_embedding - synthetic_mean_embedding\n",
    "    \n",
    "    synthetic_interview_ids = synthetic_dataset['interview_id'].unique()\n",
    "    for interview_id in synthetic_interview_ids:\n",
    "      interview_embedding = np.array(synthetic_dataset[synthetic_dataset['interview_id'] == interview_id]['open_ai_embedding'].tolist()).mean(axis = 0)\n",
    "      interview_embedding_dict[interview_id] = interview_embedding + correction_term if normalize_synthetic else interview_embedding\n",
    "  \n",
    "  embedding_size = len(list(interview_embedding_dict.values())[0])\n",
    "  return pd.DataFrame(interview_embedding_dict, index = [f'emb_{i + 1}' for i in range(embedding_size)]).T"
   ],
   "id": "ee24f64193102684",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "embeddings_df_no_correction = get_embeddings_per_interview(\n",
    "  labeled_dataset=labeled_data,\n",
    "  unlabeled_dataset=unlabeled_data,\n",
    "  synthetic_datasets=[gpt_4_data, claude_3_data],\n",
    "  normalize_synthetic=False\n",
    ")\n",
    "embeddings_df_no_correction['is_synthetic'] = embeddings_df_no_correction.index.str.contains('|'.join(SYNTHETIC_MODEL_PREFIXES))\n",
    "embeddings_df_no_correction['attachment_style'] = embeddings_df_no_correction.index.map(\n",
    "  dict(**LABEL_MAPPING_SYNTHETIC,\n",
    "       **LABEL_MAPPING_REAL)).fillna('unlabeled')\n",
    "embeddings_df_no_correction['numerical_attachment_style'] = embeddings_df_no_correction['attachment_style'].map({'avoidant':0,'preoccupied':1,'secure':2})\n",
    "\n",
    "embeddings_df_corrected = get_embeddings_per_interview(\n",
    "  labeled_dataset=labeled_data,\n",
    "  unlabeled_dataset=unlabeled_data,\n",
    "  synthetic_datasets=[gpt_4_data, claude_3_data],\n",
    "  normalize_synthetic=True\n",
    ")\n",
    "embeddings_df_corrected['is_synthetic'] = embeddings_df_corrected.index.str.contains('|'.join(SYNTHETIC_MODEL_PREFIXES))\n",
    "embeddings_df_corrected['attachment_style'] = embeddings_df_corrected.index.map(\n",
    "  dict(**LABEL_MAPPING_SYNTHETIC,\n",
    "       **LABEL_MAPPING_REAL)).fillna('unlabeled')\n",
    "embeddings_df_corrected['numerical_attachment_style'] = embeddings_df_corrected['attachment_style'].map({'avoidant':0,'preoccupied':1,'secure':2})"
   ],
   "id": "bebf5bbc8e9658e7",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## Cosine Similarities",
   "id": "7ecac1e64997f656"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def compute_cosine_similarities_per_class(embeddings_df: pd.DataFrame) -> dict[str, np.ndarray]:\n",
    "    \"\"\"\n",
    "    Computes cosine similarities between embeddings of every possible pair of interviews per attachment style in the \n",
    "    dataset.\n",
    "    \n",
    "    :param embeddings_df: a dataframe containing the interviews, associated embeddings and attachment styles.\n",
    "    :return: a dictionary containing a list of pairwise cosine similarities per attachment style.\n",
    "    \"\"\"    \n",
    "    embedding_cols = [c for c in embeddings_df.columns if 'emb' in c]\n",
    "    cosine_similarities = {'avoidant': [], 'preoccupied': [], 'secure': []}\n",
    "    for attachment_style, group in embeddings_df.groupby('attachment_style'):\n",
    "        for idx1, idx2 in combinations(group.index, 2):\n",
    "            sim = cosine_similarity(group.loc[idx1, embedding_cols].values.reshape(1, -1), group.loc[idx2, embedding_cols].values.reshape(1, -1))[0][0]\n",
    "            cosine_similarities[attachment_style].append(sim)\n",
    "    \n",
    "    return cosine_similarities\n",
    "    "
   ],
   "id": "c832ad0220bc291e",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "gpt_4_df = embeddings_df_no_correction[embeddings_df_no_correction.index.str.contains('gpt')]\n",
    "claude_df = embeddings_df_no_correction[embeddings_df_no_correction.index.str.contains('claude')]\n",
    "\n",
    "sims_gpt_4 = compute_cosine_similarities_per_class(gpt_4_df)\n",
    "sims_claude_3 = compute_cosine_similarities_per_class(claude_df)"
   ],
   "id": "43202ac384a89e78",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "w, h = calculate_best_figure_dimensions(subplots=(1, 2))\n",
    "fig, axs = plt.subplots(1, 2, figsize=(w, h*1.3))\n",
    "for attachment_style in ['avoidant', 'preoccupied', 'secure']:\n",
    "    sns.histplot(sims_gpt_4[attachment_style], kde=True, alpha=0.15, edgecolor=None,\n",
    "                 stat=\"density\", label=attachment_style, ax=axs[0], line_kws={'linewidth': 1}, binwidth=0.005)\n",
    "    sns.histplot(sims_claude_3[attachment_style], kde=True, alpha=0.15, edgecolor=None,\n",
    "                 stat=\"density\", label=attachment_style, ax=axs[1], line_kws={'linewidth': 1}, binwidth=0.005)\n",
    "\n",
    "axs[0].sharey(axs[1])\n",
    "axs[1].set_ylabel('')\n",
    "axs[0].set_xlabel('Cosine Similarity')\n",
    "axs[1].set_xlabel('Cosine Similarity')\n",
    "axs[0].set_title('GPT-4')\n",
    "axs[1].set_title('Claude 3 Opus')\n",
    "plt.tight_layout()\n",
    "axs[0].legend()\n",
    "axs[1].legend()\n",
    "\n",
    "save_plot(\"../../images/cosine_similarities_hist.pdf\", fig)"
   ],
   "id": "c0e83bb8d59d8c",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "## 2D Embeddings",
   "id": "5467cbdbc2ab1286"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def project_embeddings_to_2d(embeddings_df: pd.DataFrame) -> pd.DataFrame:\n",
    "  \"\"\"\n",
    "  Project embeddings to 2d embedding space.\n",
    "  \n",
    "  :param embeddings_df: a dataframe indexed by the interview id and each dimension of the original embedding space in an individual column.\n",
    "  :return: a dataframe indexed by the interview id and two columns encompassing the dimensions of the projected embeddings.\n",
    "  \"\"\"\n",
    "  ump = UMAP(random_state=RANDOM_SEED, n_jobs=1)\n",
    "  \n",
    "  embedding_cols = [c for c in embeddings_df.columns if 'emb' in c]\n",
    "  umap_data = pd.DataFrame(ump.fit_transform(embeddings_df[embedding_cols]),\n",
    "                           index = embeddings_df.index,\n",
    "                           columns = ['emb_1', 'emb_2'])\n",
    "  \n",
    "  non_embedding_cols = [c for c in embeddings_df.columns if 'emb' not in c]\n",
    "  for c in non_embedding_cols:\n",
    "      umap_data[c] = embeddings_df[c]\n",
    "  \n",
    "  return umap_data"
   ],
   "id": "ae60eda043ae8d4f",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Excluding Claude 3\n",
    "embeddings_2d_no_correction_gpt_4 = project_embeddings_to_2d(embeddings_df_no_correction[~embeddings_df_no_correction.index.str.contains('claude')])\n",
    "embeddings_2d_corrected_gpt_4 = project_embeddings_to_2d(embeddings_df_corrected[~embeddings_df_corrected.index.str.contains('claude')])\n",
    "\n",
    "# Excluding GPT-4\n",
    "embeddings_2d_no_correction_claude_3 = project_embeddings_to_2d(embeddings_df_no_correction[~embeddings_df_no_correction.index.str.contains('gpt')])\n",
    "embeddings_2d_corrected_claude_3 = project_embeddings_to_2d(embeddings_df_corrected[~embeddings_df_corrected.index.str.contains('gpt')])\n",
    "\n",
    "# Include both GPT-4 and Claude-3\n",
    "embeddings_2d_no_correction = project_embeddings_to_2d(embeddings_df_no_correction)\n",
    "embeddings_2d_corrected = project_embeddings_to_2d(embeddings_df_corrected)"
   ],
   "id": "e0d5a213df497ec2",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def plot_umap_embeddings(umap_embeddings: pd.DataFrame, ax: plt.Axes):\n",
    "    \"\"\"\n",
    "    Plots umap embeddings.\n",
    "    \n",
    "    :param umap_embeddings: dataframe containing umap embeddings.\n",
    "    :param ax: matplotlib axes object.\n",
    "    \"\"\"\n",
    "\n",
    "    sns.scatterplot(\n",
    "        data=umap_embeddings[umap_embeddings['attachment_style'] == 'unlabeled'],\n",
    "        x='emb_1', y='emb_2',\n",
    "        hue='attachment_style',\n",
    "        markers=['o'],\n",
    "        alpha=0.1,\n",
    "        s=25,\n",
    "        ax=ax)\n",
    "    scatter = sns.scatterplot(\n",
    "        data=umap_embeddings[umap_embeddings['attachment_style'] != 'unlabeled'],\n",
    "        x='emb_1', y='emb_2',\n",
    "        hue='attachment_style',\n",
    "        style='is_synthetic',\n",
    "        markers=['o', '$\\circ$'],\n",
    "        alpha=1,\n",
    "        s=25,       \n",
    "        ax=ax)\n",
    "\n",
    "    # # Customizing the legend\n",
    "    handles, labels = scatter.get_legend_handles_labels()\n",
    "    handles = [handles[1], handles[0]] + handles[2:]\n",
    "    labels = [labels[1], labels[0]] + labels[2:]\n",
    "    labels[0] = 'Attachment Style'\n",
    "    labels[-3] = 'Data Type'\n",
    "    labels[-2] = 'real'\n",
    "    labels[-1] = 'synthetic'\n",
    "    # Add a dummy handle for spacing\n",
    "    handles.insert(5, plt.Line2D([], [], linestyle='none'))  # No line style\n",
    "    labels.insert(5, '')  # Empty string for label    \n",
    "    \n",
    "    ax.set_xlabel('UMAP Dimension 1')\n",
    "    ax.set_ylabel('UMAP Dimension 2')\n",
    "    ax.legend(handles, labels)"
   ],
   "id": "3fc824e6e1a7c73",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### GPT-4",
   "id": "4b9a6e5a885a5444"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "w, h = calculate_best_figure_dimensions(subplots=(1, 2))\n",
    "fig, axs = plt.subplots(1, 2, figsize=(w * 0.8, h * 1.3))\n",
    "plot_umap_embeddings(embeddings_2d_no_correction_gpt_4, axs[0])\n",
    "plot_umap_embeddings(embeddings_2d_corrected_gpt_4, axs[1])\n",
    "\n",
    "axs[0].legend_ = None\n",
    "axs[1].set_ylabel('')\n",
    "axs[0].set_title('Non-standardized')\n",
    "axs[1].set_title('Standardized')\n",
    "plt.tight_layout()\n",
    "# axs[1].legend(loc='upper right', bbox_to_anchor=(2, 1))\n",
    "\n",
    "bb = axs[1].get_position()\n",
    "axs[1].legend_.set_bbox_to_anchor((1.4, 0.4, bb.width, bb.height), transform=axs[1].transAxes)\n",
    "axs[1].legend_.set_frame_on(False)\n",
    "sns.despine(left=False, bottom=False)\n",
    "\n",
    "save_plot(\"../../images/embeddings_2d_umap_gpt_4.pdf\", fig)"
   ],
   "id": "61d2fcb825457ffe",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Claude 3",
   "id": "f44ac6cb37017b1b"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "w, h = calculate_best_figure_dimensions(subplots=(1, 2))\n",
    "fig, axs = plt.subplots(1, 2, figsize=(w * 0.8, h * 1.3))\n",
    "plot_umap_embeddings(embeddings_2d_no_correction_claude_3, axs[0])\n",
    "plot_umap_embeddings(embeddings_2d_corrected_claude_3, axs[1])\n",
    "\n",
    "axs[0].legend_ = None\n",
    "axs[1].set_ylabel('')\n",
    "axs[0].set_title('Non-standardized')\n",
    "axs[1].set_title('Standardized')\n",
    "plt.tight_layout()\n",
    "# axs[1].legend(loc='upper right', bbox_to_anchor=(2, 1))\n",
    "\n",
    "bb = axs[1].get_position()\n",
    "axs[1].legend_.set_bbox_to_anchor((1.4, 0.4, bb.width, bb.height), transform=axs[1].transAxes)\n",
    "axs[1].legend_.set_frame_on(False)\n",
    "sns.despine(left=False, bottom=False)\n",
    "\n",
    "save_plot(\"../../images/embeddings_2d_umap_claude_3.pdf\", fig)"
   ],
   "id": "7d5f569a402bab97",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Both",
   "id": "c57110c71bd3ac12"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "w, h = calculate_best_figure_dimensions(subplots=(1, 2))\n",
    "fig, axs = plt.subplots(1, 2, figsize=(w * 0.8, h * 1.3))\n",
    "plot_umap_embeddings(embeddings_2d_no_correction, axs[0])\n",
    "plot_umap_embeddings(embeddings_2d_corrected, axs[1])\n",
    "\n",
    "axs[0].legend_ = None\n",
    "axs[1].set_ylabel('')\n",
    "axs[0].set_title('Non-standardized')\n",
    "axs[1].set_title('Standardized')\n",
    "plt.tight_layout()\n",
    "# axs[1].legend(loc='upper right', bbox_to_anchor=(2, 1))\n",
    "\n",
    "bb = axs[1].get_position()\n",
    "axs[1].legend_.set_bbox_to_anchor((1.4, 0.4, bb.width, bb.height), transform=axs[1].transAxes)\n",
    "axs[1].legend_.set_frame_on(False)\n",
    "sns.despine(left=False, bottom=False)\n",
    "\n",
    "save_plot(\"../../images/embeddings_2d_umap_gpt_4_and_claude_3.pdf\", fig)"
   ],
   "id": "f6b65e8f326d0480",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "## Prediction\n",
    "\n",
    "### Human on Human (CVLOO)"
   ],
   "id": "7105647fd4c53778"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def get_num_pca_components(data: np.ndarray, perc_variance_to_retain: float = 0.99) -> int:\n",
    "    \"\"\"\n",
    "    Estimates the number of PCA components to explain a certain percentage of the variance.\n",
    "    \n",
    "    :param data: Array of shape (n_samples, n_features)\n",
    "    :param perc_variance_to_retain: percentage of variance to explain with the components.     \n",
    "    :return: number of PCA components.\n",
    "    \"\"\"\n",
    "    pca = PCA()\n",
    "    pca.fit(data)\n",
    "    return int(\n",
    "        np.where(pca.explained_variance_ratio_.cumsum() >= perc_variance_to_retain)[0].min()) + 1\n",
    "\n",
    "\n",
    "def eval_loo(model: BaseEstimator, embeddings_df: pd.DataFrame, num_repeat: int = 10) -> tuple[\n",
    "    np.ndarray, np.ndarray]:\n",
    "    \"\"\"\n",
    "    Evaluates the performance by doing LOO prediction.\n",
    "    \n",
    "    :param model: model to fit.\n",
    "    :param embeddings_df: a dataframe containing column embeddings and associated numerical_attachment_style.\n",
    "    :param num_repeat: the number of times to repeat the experiment. This should mitigate the effect of \n",
    "        different random seeds when training the models.\n",
    "    :return: a tuple containing a list of probabilities and test class, one per LOO split.\n",
    "    \"\"\"\n",
    "    \n",
    "    loo = LeaveOneOut()\n",
    "    embeddings_df = embeddings_df.sample(frac=1)\n",
    "    \n",
    "    if not hasattr(model, 'random_state'):\n",
    "        # No randomness in the model fitting\n",
    "        num_repeat = 1\n",
    "    \n",
    "    np.random.seed(RANDOM_SEED)  # For reproducibility\n",
    "    random_states = np.random.randint(0, 100, size=num_repeat)\n",
    "    \n",
    "    embedding_cols = [c for c in embeddings_df_labeled_gpt_4 if 'emb' in c]\n",
    "    # num_pca_components = get_num_pca_components(embeddings_df[embedding_cols].values)\n",
    "    # print(f'Number of PCA components: {num_pca_components}')\n",
    "    \n",
    "    y_true_all, y_pred_proba_all = [], []\n",
    "    for i in range(num_repeat):\n",
    "        y_true, y_pred_proba = [], []\n",
    "\n",
    "        for train_index, test_index in loo.split(embeddings_df):\n",
    "            # Split the data\n",
    "            X_train = np.array(embeddings_df.iloc[train_index][embedding_cols])\n",
    "            X_test = np.array(embeddings_df.iloc[test_index][embedding_cols])\n",
    "            y_train = np.array(embeddings_df.iloc[train_index]['numerical_attachment_style'])\n",
    "            y_test = np.array(embeddings_df.iloc[test_index]['numerical_attachment_style'])\n",
    "\n",
    "            # Train the classifier\n",
    "            cloned_model = clone(model)\n",
    "            if hasattr(cloned_model, 'random_state'):                \n",
    "                cloned_model.random_state = random_states[i]\n",
    "                \n",
    "            classifier = make_pipeline(\n",
    "                StandardScaler(),\n",
    "                # PCA(num_pca_components),\n",
    "                cloned_model\n",
    "            )            \n",
    "            classifier.fit(X_train, y_train)\n",
    "\n",
    "            y_pred_proba.append(classifier.predict_proba(X_test)[0])\n",
    "            y_true.append(y_test)\n",
    "\n",
    "        y_pred_proba_all.append(y_pred_proba)\n",
    "        y_true_all.append(y_true)\n",
    "\n",
    "    return np.array(y_pred_proba_all), np.array(y_true_all)\n",
    "\n",
    "\n",
    "def get_performance_table(y_pred_proba: np.ndarray, y_true: np.ndarray) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Prints the performance of predictions against ground truth labels.\n",
    "    \n",
    "    :param y_pred_proba: an array of probabilities of predictions.\n",
    "    :param y_true: an array of true labels.\n",
    "    :return: a dictionary containing the performance of predictions against ground truth labels per class and global.\n",
    "    \"\"\"\n",
    "    data = []\n",
    "    for i in range(y_pred_proba.shape[0]):\n",
    "        for j, attachment_style in enumerate(['avoidant', 'preoccupied', 'secure']):\n",
    "            data.append({\n",
    "                'roc_auc': roc_auc_score(y_true[i] == j, y_pred_proba[i, :, j], average='micro',\n",
    "                                         multi_class='ovr'),\n",
    "                'nll': log_loss(y_true[i] == j, y_pred_proba[i, :, j]),\n",
    "                'balanced_acc': balanced_accuracy_score(y_true[i] == j,\n",
    "                                                        y_pred_proba[i].argmax(axis=1) == j),\n",
    "                'attachment_style': attachment_style,\n",
    "                'run': i,\n",
    "            })\n",
    "        data.append({\n",
    "            'roc_auc': roc_auc_score(y_true[i], y_pred_proba[i], average='micro',\n",
    "                                     multi_class='ovr'),\n",
    "            'nll': log_loss(y_true[i], y_pred_proba[i]),\n",
    "            'balanced_acc': balanced_accuracy_score(y_true[i], y_pred_proba[i].argmax(axis=1)),\n",
    "            'attachment_style': 'all',\n",
    "            'run': i,\n",
    "        })\n",
    "    return pd.DataFrame(data)"
   ],
   "id": "c63807991993cfe0",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# model = LogisticRegression(penalty='l1', solver='liblinear')\n",
    "# model = LogisticRegression(penalty='l2', solver='liblinear')\n",
    "# model = ExtraTreesClassifier(n_estimators=500)\n",
    "model = MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,))\n",
    "\n",
    "\n",
    "\n",
    "# from xgboost import XGBClassifier\n",
    "# \n",
    "# model = XGBClassifier(max_depth=9)\n",
    "# \n",
    "embeddings_df_labeled_human = embeddings_df_corrected[\n",
    "    (~embeddings_df_corrected['is_synthetic']) & (\n",
    "            embeddings_df_corrected['attachment_style'] != 'unlabeled')]\n",
    "res_df1 = get_performance_table(*eval_loo(model, embeddings_df_labeled_human))\n",
    "res_df1['embedding_condition'] = 'original'\n",
    "\n",
    "# embeddings_df_labeled_human = embeddings_2d_corrected_gpt_4[\n",
    "#     (~embeddings_2d_corrected_gpt_4['is_synthetic']) & (\n",
    "#             embeddings_2d_corrected_gpt_4['attachment_style'] != 'unlabeled')]\n",
    "# res_df2 = get_performance_table(*eval_loo(pipeline, embeddings_df_labeled_human))\n",
    "# res_df2['embedding_condition'] = 'UMAP (GPT-4)'\n",
    "\n",
    "# embeddings_df_labeled_human = embeddings_2d_corrected_claude_3[\n",
    "#     (~embeddings_2d_corrected_claude_3['is_synthetic']) & (\n",
    "#             embeddings_2d_corrected_claude_3['attachment_style'] != 'unlabeled')]\n",
    "# res_df3 = get_performance_table(*eval_loo(pipeline, embeddings_df_labeled_human))\n",
    "# res_df3['embedding_condition'] = 'UMAP (Claude 3)'\n",
    "# \n",
    "# embeddings_df_labeled_human = embeddings_2d_corrected[\n",
    "#     (~embeddings_2d_corrected['is_synthetic']) & (\n",
    "#             embeddings_2d_corrected['attachment_style'] != 'unlabeled')]\n",
    "# res_df4 = get_performance_table(*eval_loo(pipeline, embeddings_df_labeled_human))\n",
    "# res_df4['embedding_condition'] = 'UMAP (GPT + Claude)'\n",
    "\n",
    "res_df = res_df1 #pd.concat([res_df1, res_df2, res_df3, res_df4])\n",
    "res_df.groupby(['attachment_style', 'embedding_condition'])[\n",
    "    ['roc_auc', 'nll', 'balanced_acc']].agg(\n",
    "    lambda x: f'{x.mean():.4f} ({x.std() / np.sqrt(len(x)):.4f})')"
   ],
   "id": "388cc8e59d3a4887",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Synthetic on Human",
   "id": "293566a1ebfca97a"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def eval_train_test(model: BaseEstimator,\n",
    "                    training_embeddings_df: pd.DataFrame,\n",
    "                    test_embeddings_df: pd.DataFrame,\n",
    "                    num_repeat: int = 10) -> tuple[np.ndarray, np.ndarray]:\n",
    "    \"\"\"\n",
    "    Evaluates the performance by doing LOO prediction.\n",
    "    \n",
    "    :param model: model to fit.\n",
    "    :param training_embeddings_df: a dataframe containing column embeddings and associated numerical_attachment_style for training.\n",
    "    :param test_embeddings_df: a dataframe containing column embeddings and associated numerical_attachment_style for testing.\n",
    "    :param num_repeat: the number of times to repeat the experiment. This should mitigate the effect of different random seeds \n",
    "        when training the models.\n",
    "    :return: a tuple containing a list of probabilities and test class, one per LOO split.\n",
    "    \"\"\"\n",
    "\n",
    "    training_embeddings_df = training_embeddings_df.sample(frac=1)\n",
    "    embedding_cols = [c for c in training_embeddings_df if 'emb' in c]\n",
    "    \n",
    "    np.random.seed(RANDOM_SEED)  # For reproducibility\n",
    "    random_states = np.random.randint(0, 100, size=num_repeat)\n",
    "\n",
    "    # num_pca_components = get_num_pca_components(training_embeddings_df[embedding_cols].values)\n",
    "    # print(f'Number of PCA components: {num_pca_components}')\n",
    "\n",
    "    X_train = np.array(training_embeddings_df[embedding_cols])\n",
    "    X_test = np.array(test_embeddings_df[embedding_cols])\n",
    "    y_train = np.array(training_embeddings_df['numerical_attachment_style'])\n",
    "    y_test = np.array(test_embeddings_df['numerical_attachment_style'])\n",
    "\n",
    "    y_true_all, y_pred_proba_all = [], []\n",
    "    for i in range(num_repeat):\n",
    "        # Train the classifier\n",
    "        cloned_model = clone(model)\n",
    "        if hasattr(cloned_model, 'random_state'):                \n",
    "            cloned_model.random_state = random_states[i]\n",
    "            \n",
    "        classifier = make_pipeline(\n",
    "            StandardScaler(),\n",
    "            # PCA(num_pca_components),\n",
    "            cloned_model\n",
    "        )            \n",
    "        classifier.fit(X_train, y_train)\n",
    "        y_pred_proba = classifier.predict_proba(X_test)\n",
    "\n",
    "        y_true_all.append(y_test)\n",
    "        y_pred_proba_all.append(y_pred_proba)        \n",
    "\n",
    "    return np.array(y_pred_proba_all), np.array(y_true_all) "
   ],
   "id": "26851212599f2eee",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "#### GPT-4",
   "id": "d0868d7be91a3def"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Train on the GPT-4 synthetic interviews and evaluate on the labeled human interviews.\n",
    "# Other classifiers that can be swapped in for `model`:\n",
    "#   LogisticRegression(penalty='l1', solver='liblinear')\n",
    "#   LogisticRegression(penalty='l2', solver='liblinear')\n",
    "#   ExtraTreesClassifier(n_estimators=500)\n",
    "model = MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,))\n",
    "\n",
    "# Condition 1: embeddings without correction.\n",
    "is_labeled_human = (~embeddings_df_no_correction['is_synthetic']) & (\n",
    "    embeddings_df_no_correction['attachment_style'] != 'unlabeled')\n",
    "is_gpt_4 = embeddings_df_no_correction.index.str.contains('gpt')\n",
    "embeddings_df_labeled_human = embeddings_df_no_correction[is_labeled_human]\n",
    "embeddings_df_labeled_gpt_4 = embeddings_df_no_correction[is_gpt_4]\n",
    "train_test_outputs = eval_train_test(model, embeddings_df_labeled_gpt_4, embeddings_df_labeled_human)\n",
    "res_df1 = get_performance_table(*train_test_outputs).assign(\n",
    "    embedding_condition='original', standardized=False)\n",
    "\n",
    "# Condition 2: corrected embeddings.\n",
    "is_labeled_human = (~embeddings_df_corrected['is_synthetic']) & (\n",
    "    embeddings_df_corrected['attachment_style'] != 'unlabeled')\n",
    "is_gpt_4 = embeddings_df_corrected.index.str.contains('gpt')\n",
    "embeddings_df_labeled_human = embeddings_df_corrected[is_labeled_human]\n",
    "embeddings_df_labeled_gpt_4 = embeddings_df_corrected[is_gpt_4]\n",
    "train_test_outputs = eval_train_test(model, embeddings_df_labeled_gpt_4, embeddings_df_labeled_human)\n",
    "res_df2 = get_performance_table(*train_test_outputs).assign(\n",
    "    embedding_condition='original', standardized=True)\n",
    "\n",
    "# Report each metric as \"mean (standard error)\" per attachment style and condition.\n",
    "group_columns = ['attachment_style', 'embedding_condition', 'standardized']\n",
    "metric_columns = ['roc_auc', 'nll', 'balanced_acc']\n",
    "res_df = pd.concat([res_df1, res_df2])\n",
    "res_df.groupby(group_columns)[metric_columns].agg(\n",
    "    lambda x: f'{x.mean():.4f} ({x.std() / np.sqrt(len(x)):.4f})')"
   ],
   "id": "1f72c9feceda3987",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "#### Claude 3",
   "id": "c6dc3d42c0feaf7c"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Train on the Claude 3 synthetic interviews and evaluate on the labeled human interviews.\n",
    "# Other classifiers that can be swapped in for `model`:\n",
    "#   LogisticRegression(penalty='l1', solver='liblinear')\n",
    "#   LogisticRegression(penalty='l2', solver='liblinear')\n",
    "#   ExtraTreesClassifier(n_estimators=500)\n",
    "model = MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,))\n",
    "\n",
    "# Condition 1: embeddings without correction.\n",
    "is_labeled_human = (~embeddings_df_no_correction['is_synthetic']) & (\n",
    "    embeddings_df_no_correction['attachment_style'] != 'unlabeled')\n",
    "is_claude_3 = embeddings_df_no_correction.index.str.contains('claude')\n",
    "embeddings_df_labeled_human = embeddings_df_no_correction[is_labeled_human]\n",
    "embeddings_df_labeled_claude_3 = embeddings_df_no_correction[is_claude_3]\n",
    "train_test_outputs = eval_train_test(model, embeddings_df_labeled_claude_3, embeddings_df_labeled_human)\n",
    "res_df1 = get_performance_table(*train_test_outputs).assign(\n",
    "    embedding_condition='original', standardized=False)\n",
    "\n",
    "# Condition 2: corrected embeddings.\n",
    "is_labeled_human = (~embeddings_df_corrected['is_synthetic']) & (\n",
    "    embeddings_df_corrected['attachment_style'] != 'unlabeled')\n",
    "is_claude_3 = embeddings_df_corrected.index.str.contains('claude')\n",
    "embeddings_df_labeled_human = embeddings_df_corrected[is_labeled_human]\n",
    "embeddings_df_labeled_claude_3 = embeddings_df_corrected[is_claude_3]\n",
    "train_test_outputs = eval_train_test(model, embeddings_df_labeled_claude_3, embeddings_df_labeled_human)\n",
    "res_df2 = get_performance_table(*train_test_outputs).assign(\n",
    "    embedding_condition='original', standardized=True)\n",
    "\n",
    "# Report each metric as \"mean (standard error)\" per attachment style and condition.\n",
    "group_columns = ['attachment_style', 'embedding_condition', 'standardized']\n",
    "metric_columns = ['roc_auc', 'nll', 'balanced_acc']\n",
    "res_df = pd.concat([res_df1, res_df2])\n",
    "res_df.groupby(group_columns)[metric_columns].agg(\n",
    "    lambda x: f'{x.mean():.4f} ({x.std() / np.sqrt(len(x)):.4f})')"
   ],
   "id": "4f1f36ea2826f046",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "#### Both (GPT-4 + Claude 3)",
   "id": "19687d98ee1ab510"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Train on all synthetic interviews (every row flagged `is_synthetic`) and evaluate on the\n",
    "# labeled human interviews.\n",
    "# Other classifiers that can be swapped in for `model`:\n",
    "#   LogisticRegression(penalty='l1', solver='liblinear')\n",
    "#   LogisticRegression(penalty='l2', solver='liblinear')\n",
    "#   ExtraTreesClassifier(n_estimators=500)\n",
    "model = MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,))\n",
    "\n",
    "# Condition 1: embeddings without correction.\n",
    "is_synthetic = embeddings_df_no_correction['is_synthetic']\n",
    "is_labeled = embeddings_df_no_correction['attachment_style'] != 'unlabeled'\n",
    "embeddings_df_labeled_human = embeddings_df_no_correction[(~is_synthetic) & is_labeled]\n",
    "embeddings_df_labeled_both = embeddings_df_no_correction[is_synthetic]\n",
    "train_test_outputs = eval_train_test(model, embeddings_df_labeled_both, embeddings_df_labeled_human)\n",
    "res_df1 = get_performance_table(*train_test_outputs).assign(\n",
    "    embedding_condition='original', standardized=False)\n",
    "\n",
    "# Condition 2: corrected embeddings.\n",
    "is_synthetic = embeddings_df_corrected['is_synthetic']\n",
    "is_labeled = embeddings_df_corrected['attachment_style'] != 'unlabeled'\n",
    "embeddings_df_labeled_human = embeddings_df_corrected[(~is_synthetic) & is_labeled]\n",
    "embeddings_df_labeled_both = embeddings_df_corrected[is_synthetic]\n",
    "train_test_outputs = eval_train_test(model, embeddings_df_labeled_both, embeddings_df_labeled_human)\n",
    "res_df2 = get_performance_table(*train_test_outputs).assign(\n",
    "    embedding_condition='original', standardized=True)\n",
    "\n",
    "# Report each metric as \"mean (standard error)\" per attachment style and condition.\n",
    "group_columns = ['attachment_style', 'embedding_condition', 'standardized']\n",
    "metric_columns = ['roc_auc', 'nll', 'balanced_acc']\n",
    "res_df = pd.concat([res_df1, res_df2])\n",
    "res_df.groupby(group_columns)[metric_columns].agg(\n",
    "    lambda x: f'{x.mean():.4f} ({x.std() / np.sqrt(len(x)):.4f})')"
   ],
   "id": "cf476932a94f3c6",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Synthetic Data Increment",
   "id": "fbc90fa2ba59efe2"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def eval_data_increment(model: BaseEstimator,\n",
    "                        synthetic_data: pd.DataFrame,\n",
    "                        labeled_data: pd.DataFrame,\n",
    "                        unlabeled_data: pd.DataFrame,\n",
    "                        use_2d_embeddings: bool,\n",
    "                        apply_correction: bool,\n",
    "                        initial_num_interviews_per_class: int,\n",
    "                        final_num_interviews_per_class: int,\n",
    "                        num_repetitions: int = 10) -> tuple[np.ndarray, np.ndarray]:\n",
    "    \"\"\"\n",
    "    Evaluates the prediction performance on the labeled human data as we add more synthetic data (interviews).\n",
    "\n",
    "    For every number of interviews per class in the requested range, synthetic interviews are sampled\n",
    "    without replacement `num_repetitions` times. Each sample is used as training data and the fitted model\n",
    "    is evaluated on the labeled human interviews.\n",
    "\n",
    "    :param model: the model to fit.\n",
    "    :param synthetic_data: a dataframe containing the synthetic dataset to use.\n",
    "    :param labeled_data: a dataframe containing the labeled real dataset to use.\n",
    "    :param unlabeled_data: a dataframe containing the unlabeled real dataset to use.\n",
    "    :param use_2d_embeddings: whether to project embeddings to 2D (via `project_embeddings_to_2d`) before fitting.\n",
    "    :param apply_correction: whether to normalize the synthetic embeddings using unlabeled data.\n",
    "    :param initial_num_interviews_per_class: the initial number of interviews per class to sample.\n",
    "    :param final_num_interviews_per_class: the final number of interviews per class to sample (inclusive).\n",
    "    :param num_repetitions: the number of times we sample interviews.\n",
    "    :return: a tuple containing an array of mean AUCs (over sampling repetitions) per number of interviews\n",
    "        per class, and another array with the associated standard errors.\n",
    "    :raises ValueError: if the largest requested number of interviews per class exceeds the number of\n",
    "        synthetic interviews available in some class.\n",
    "    \"\"\"\n",
    "    np.random.seed(RANDOM_SEED)  # For reproducibility\n",
    "\n",
    "    # Group the synthetic interview ids by attachment style so we can sample per class.\n",
    "    synthetic_interviews_per_class = {'avoidant': [], 'preoccupied': [], 'secure': []}\n",
    "    label_mapping_training = \\\n",
    "        synthetic_data[['interview_id', 'attachment_style']].drop_duplicates().set_index(\n",
    "            'interview_id')['attachment_style'].to_dict()\n",
    "    for interview_id, attachment_style in label_mapping_training.items():\n",
    "        synthetic_interviews_per_class[attachment_style].append(interview_id)\n",
    "\n",
    "    sample_sizes = range(initial_num_interviews_per_class, final_num_interviews_per_class + 1)\n",
    "\n",
    "    # Fail fast: sampling without replacement cannot draw more interviews than a class has, and\n",
    "    # otherwise the error would only surface after all the smaller sample sizes were evaluated.\n",
    "    if len(sample_sizes) > 0:\n",
    "        num_available = min(len(ids) for ids in synthetic_interviews_per_class.values())\n",
    "        if sample_sizes[-1] > num_available:\n",
    "            raise ValueError(\n",
    "                f'Cannot sample {sample_sizes[-1]} interviews per class: '\n",
    "                f'the smallest class has only {num_available} synthetic interviews.')\n",
    "\n",
    "    # Loop invariants, built once instead of on every repetition.\n",
    "    synthetic_index_pattern = '|'.join(SYNTHETIC_MODEL_PREFIXES)\n",
    "    label_mapping_all = dict(**LABEL_MAPPING_SYNTHETIC, **LABEL_MAPPING_REAL)\n",
    "    numerical_labels = {'avoidant': 0, 'preoccupied': 1, 'secure': 2}\n",
    "\n",
    "    mean_aucs = []\n",
    "    std_aucs = []\n",
    "    for i in tqdm(sample_sizes):\n",
    "        aucs = []\n",
    "        for _ in range(num_repetitions):\n",
    "            # Draw `i` interviews from each class.\n",
    "            selected_interviews = []\n",
    "            for interview_ids in synthetic_interviews_per_class.values():\n",
    "                selected_interviews.extend(np.random.choice(interview_ids, i, replace=False))\n",
    "\n",
    "            synthetic_data_mini = synthetic_data[\n",
    "                synthetic_data['interview_id'].isin(selected_interviews)]\n",
    "            embeddings_df = get_embeddings_per_interview(\n",
    "                labeled_dataset=labeled_data,\n",
    "                unlabeled_dataset=unlabeled_data,\n",
    "                synthetic_datasets=[synthetic_data_mini],\n",
    "                normalize_synthetic=apply_correction\n",
    "            )\n",
    "            # Rows whose index contains a synthetic model prefix are synthetic; ids missing from\n",
    "            # both label mappings are marked as 'unlabeled'.\n",
    "            embeddings_df['is_synthetic'] = embeddings_df.index.str.contains(synthetic_index_pattern)\n",
    "            embeddings_df['attachment_style'] = embeddings_df.index.map(\n",
    "                label_mapping_all).fillna('unlabeled')\n",
    "            embeddings_df['numerical_attachment_style'] = embeddings_df['attachment_style'].map(\n",
    "                numerical_labels)\n",
    "\n",
    "            if use_2d_embeddings:\n",
    "                embeddings_df = project_embeddings_to_2d(embeddings_df)\n",
    "\n",
    "            # Train on the sampled synthetic interviews, test on the labeled human interviews.\n",
    "            test_embeddings_df = embeddings_df[\n",
    "                (~embeddings_df['is_synthetic']) & (\n",
    "                        embeddings_df['attachment_style'] != 'unlabeled')]\n",
    "            training_embeddings_df = embeddings_df[embeddings_df['is_synthetic']]\n",
    "            res = get_performance_table(\n",
    "                *eval_train_test(model, training_embeddings_df, test_embeddings_df))\n",
    "            aucs.append(res[res['attachment_style'] == 'all']['roc_auc'].mean())  # Over all the runs\n",
    "\n",
    "        mean_aucs.append(np.mean(aucs))\n",
    "        std_aucs.append(np.std(aucs))\n",
    "\n",
    "    # Standard error of the mean over the sampling repetitions.\n",
    "    return np.array(mean_aucs), np.array(std_aucs) / np.sqrt(num_repetitions)"
   ],
   "id": "ab34b0d04ecaae09",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "def plot_data_increment(means: np.ndarray,\n",
    "                        std_errs: np.ndarray,\n",
    "                        ax: plt.Axes,\n",
    "                        series_label: str,\n",
    "                        marker_size: int,\n",
    "                        marker: str = 'o',\n",
    "                        line_width: float = 1,\n",
    "                        first_x: int = 1) -> None:\n",
    "    \"\"\"\n",
    "    Plots data increment performance as a dotted line of the means with a shaded band of\n",
    "    plus/minus one standard error around it.\n",
    "\n",
    "    :param means: mean values over different runs, one per x position.\n",
    "    :param std_errs: standard error over different runs, aligned with `means`.\n",
    "    :param ax: matplotlib axes.\n",
    "    :param series_label: series label.\n",
    "    :param marker_size: the size of the marker.\n",
    "    :param marker: the marker of the plot.\n",
    "    :param line_width: the width of the line.\n",
    "    :param first_x: the x value of the first point; the following points are spaced by 1.\n",
    "    \"\"\"\n",
    "    # Accept any array-like (e.g. plain lists) so the band arithmetic below always works.\n",
    "    means = np.asarray(means, dtype=float)\n",
    "    std_errs = np.asarray(std_errs, dtype=float)\n",
    "\n",
    "    xs = np.arange(first_x, first_x + len(means))\n",
    "    ax.plot(xs, means, marker=marker, label=series_label, markersize=marker_size, linewidth=line_width, linestyle='dotted')\n",
    "    ax.fill_between(xs, means - std_errs, means + std_errs, alpha=0.2)"
   ],
   "id": "b6a7cb89e481fc8a",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "### Different Conditions\n",
    "\n",
    "#### LR"
   ],
   "id": "60b91b4a24a2137"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Keyword arguments shared by every logistic regression data increment run in this cell.\n",
    "lr_increment_kwargs = dict(\n",
    "    labeled_data=labeled_data,\n",
    "    unlabeled_data=unlabeled_data,\n",
    "    initial_num_interviews_per_class=1,\n",
    "    final_num_interviews_per_class=20,\n",
    "    num_repetitions=10,\n",
    "    use_2d_embeddings=False,\n",
    ")\n",
    "\n",
    "# GPT-4, l1 penalty (with and without correction)\n",
    "mean_aucs_corrected_lr_l1_gpt_4, std_errs_corrected_lr_l1_gpt_4 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l1', solver='liblinear'),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=True,\n",
    "    **lr_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_lr_l1_gpt_4, std_errs_not_corrected_lr_l1_gpt_4 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l1', solver='liblinear'),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=False,\n",
    "    **lr_increment_kwargs)\n",
    "\n",
    "# GPT-4, l2 penalty (with and without correction)\n",
    "mean_aucs_corrected_lr_l2_gpt_4, std_errs_corrected_lr_l2_gpt_4 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l2', solver='liblinear'),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=True,\n",
    "    **lr_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_lr_l2_gpt_4, std_errs_not_corrected_lr_l2_gpt_4 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l2', solver='liblinear'),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=False,\n",
    "    **lr_increment_kwargs)\n",
    "\n",
    "# Claude 3, l1 penalty (with and without correction)\n",
    "mean_aucs_corrected_lr_l1_claude_3, std_errs_corrected_lr_l1_claude_3 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l1', solver='liblinear'),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=True,\n",
    "    **lr_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_lr_l1_claude_3, std_errs_not_corrected_lr_l1_claude_3 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l1', solver='liblinear'),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=False,\n",
    "    **lr_increment_kwargs)\n",
    "\n",
    "# Claude 3, l2 penalty (with and without correction)\n",
    "mean_aucs_corrected_lr_l2_claude_3, std_errs_corrected_lr_l2_claude_3 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l2', solver='liblinear'),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=True,\n",
    "    **lr_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_lr_l2_claude_3, std_errs_not_corrected_lr_l2_claude_3 = eval_data_increment(\n",
    "    model=LogisticRegression(penalty='l2', solver='liblinear'),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=False,\n",
    "    **lr_increment_kwargs)"
   ],
   "id": "e44d779e48582521",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "#### Extra Trees",
   "id": "b0b17ae8424a32a1"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Keyword arguments shared by every Extra Trees data increment run in this cell.\n",
    "etc_increment_kwargs = dict(\n",
    "    labeled_data=labeled_data,\n",
    "    unlabeled_data=unlabeled_data,\n",
    "    initial_num_interviews_per_class=1,\n",
    "    final_num_interviews_per_class=20,\n",
    "    num_repetitions=10,\n",
    "    use_2d_embeddings=False,\n",
    ")\n",
    "\n",
    "# GPT-4 (with and without correction)\n",
    "mean_aucs_corrected_etc_gpt_4, std_errs_corrected_etc_gpt_4 = eval_data_increment(\n",
    "    model=ExtraTreesClassifier(n_estimators=500),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=True,\n",
    "    **etc_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_etc_gpt_4, std_errs_not_corrected_etc_gpt_4 = eval_data_increment(\n",
    "    model=ExtraTreesClassifier(n_estimators=500),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=False,\n",
    "    **etc_increment_kwargs)\n",
    "\n",
    "# Claude 3 (with and without correction)\n",
    "mean_aucs_corrected_etc_claude_3, std_errs_corrected_etc_claude_3 = eval_data_increment(\n",
    "    model=ExtraTreesClassifier(n_estimators=500),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=True,\n",
    "    **etc_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_etc_claude_3, std_errs_not_corrected_etc_claude_3 = eval_data_increment(\n",
    "    model=ExtraTreesClassifier(n_estimators=500),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=False,\n",
    "    **etc_increment_kwargs)"
   ],
   "id": "3457216436247697",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "#### MLP",
   "id": "955aa3272511e7e"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Keyword arguments shared by every MLP data increment run in this cell.\n",
    "mlp_increment_kwargs = dict(\n",
    "    labeled_data=labeled_data,\n",
    "    unlabeled_data=unlabeled_data,\n",
    "    initial_num_interviews_per_class=1,\n",
    "    final_num_interviews_per_class=20,\n",
    "    num_repetitions=10,\n",
    "    use_2d_embeddings=False,\n",
    ")\n",
    "\n",
    "# GPT-4 (with and without correction)\n",
    "mean_aucs_corrected_mlp_gpt_4, std_errs_corrected_mlp_gpt_4 = eval_data_increment(\n",
    "    model=MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,)),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=True,\n",
    "    **mlp_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_mlp_gpt_4, std_errs_not_corrected_mlp_gpt_4 = eval_data_increment(\n",
    "    model=MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,)),\n",
    "    synthetic_data=gpt_4_data,\n",
    "    apply_correction=False,\n",
    "    **mlp_increment_kwargs)\n",
    "\n",
    "# Claude 3 (with and without correction)\n",
    "mean_aucs_corrected_mlp_claude_3, std_errs_corrected_mlp_claude_3 = eval_data_increment(\n",
    "    model=MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,)),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=True,\n",
    "    **mlp_increment_kwargs)\n",
    "\n",
    "mean_aucs_not_corrected_mlp_claude_3, std_errs_not_corrected_mlp_claude_3 = eval_data_increment(\n",
    "    model=MLPClassifier(max_iter=1000, hidden_layer_sizes=(16,)),\n",
    "    synthetic_data=claude_3_data,\n",
    "    apply_correction=False,\n",
    "    **mlp_increment_kwargs)"
   ],
   "id": "7b89dccca5a85f09",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### New data increment plot",
   "id": "527d3ca27a10514a"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# 2x2 figure: one panel per classifier, each comparing GPT-4 and Claude 3 Opus synthetic data\n",
    "# (corrected embeddings) as the number of synthetic interviews per class grows.\n",
    "w, h = calculate_best_figure_dimensions(subplots=(2,2))\n",
    "fig, axs = plt.subplots(2, 2, figsize=(w, h*1.3))\n",
    "\n",
    "# Raw strings keep the backslash of the mathtext command without an invalid escape sequence.\n",
    "axs[0,0].set_title(r'LR ($\\ell1$)')\n",
    "axs[0,1].set_title(r'LR ($\\ell2$)')\n",
    "axs[1,0].set_title('Extra Trees')\n",
    "axs[1,1].set_title('MLP')\n",
    "\n",
    "for ax in axs.flatten():\n",
    "    ax.yaxis.set_major_locator(MultipleLocator(base=0.05))\n",
    "    ax.xaxis.set_major_locator(MultipleLocator(base=2))\n",
    "    ax.set_ylim([0.4,0.9])\n",
    "    ax.set_ylabel('ROC AUC')\n",
    "    ax.set_xlabel('Interviews per Class')\n",
    "\n",
    "# Only the left column keeps the y label and only the bottom row keeps the x label.\n",
    "axs[0,1].set_ylabel('')\n",
    "axs[1,1].set_ylabel('')\n",
    "axs[0,0].set_xlabel('')\n",
    "axs[0,1].set_xlabel('')\n",
    "# Chain the panels so all four share the same y axis.\n",
    "axs[0,0].sharey(axs[0,1])\n",
    "axs[0,1].sharey(axs[1,0])\n",
    "axs[1,0].sharey(axs[1,1])\n",
    "\n",
    "plot_data_increment(mean_aucs_corrected_lr_l1_gpt_4, std_errs_corrected_lr_l1_gpt_4, axs[0,0], 'GPT-4', 3)\n",
    "plot_data_increment(mean_aucs_corrected_lr_l1_claude_3, std_errs_corrected_lr_l1_claude_3, axs[0,0], 'Claude 3 Opus', 3)\n",
    "\n",
    "plot_data_increment(mean_aucs_corrected_lr_l2_gpt_4, std_errs_corrected_lr_l2_gpt_4, axs[0,1], 'GPT-4', 3)\n",
    "# The Claude 3 series must carry its own label; it was previously labelled 'GPT-4' in the legend.\n",
    "plot_data_increment(mean_aucs_corrected_lr_l2_claude_3, std_errs_corrected_lr_l2_claude_3, axs[0,1], 'Claude 3 Opus', 3)\n",
    "\n",
    "plot_data_increment(mean_aucs_corrected_etc_gpt_4, std_errs_corrected_etc_gpt_4, axs[1,0], 'GPT-4', 3)\n",
    "plot_data_increment(mean_aucs_corrected_etc_claude_3, std_errs_corrected_etc_claude_3, axs[1,0], 'Claude 3 Opus', 3)\n",
    "\n",
    "plot_data_increment(mean_aucs_corrected_mlp_gpt_4, std_errs_corrected_mlp_gpt_4, axs[1,1], 'GPT-4', 3)\n",
    "plot_data_increment(mean_aucs_corrected_mlp_claude_3, std_errs_corrected_mlp_claude_3, axs[1,1], 'Claude 3 Opus', 3)\n",
    "\n",
    "sns.despine(left=False, bottom=False)\n",
    "plt.tight_layout()\n",
    "for ax in axs.flatten():\n",
    "    ax.legend()\n",
    "\n",
    "save_plot(\"../../images/data_increment_auc.pdf\", fig)"
   ],
   "id": "829928cb5364877",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "### Token Count",
   "id": "afd658d3e602dad8"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "import tiktoken\n",
    "\n",
    "# Tokenizer of the GPT-4 model family; the token count cells below use it for every dataset,\n",
    "# so counts for non-GPT datasets are computed with this tokenizer as well.\n",
    "encoding = tiktoken.encoding_for_model(\"gpt-4\")"
   ],
   "id": "7b1606dd1ae295ee",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Token statistics of the GPT-4 synthetic answers: mean (std) per answer, then the total.\n",
    "gpt_4_answers = pd.read_csv('../../datasets/synthetic_dataset_gpt-4.csv')['answer']\n",
    "num_tokens = [len(encoding.encode(answer)) for answer in gpt_4_answers]\n",
    "print(f'{np.mean(num_tokens):.4f} ({np.std(num_tokens):.4f})')\n",
    "print(np.sum(num_tokens))"
   ],
   "id": "9ecd4c805eb61056",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": [
    "# Token statistics of the Claude 3 Opus synthetic answers: mean (std) per answer, then the total.\n",
    "claude_3_answers = pd.read_csv('../../datasets/synthetic_dataset_claude-3-opus-20240229.csv')['answer']\n",
    "num_tokens = [len(encoding.encode(answer)) for answer in claude_3_answers]\n",
    "print(f'{np.mean(num_tokens):.4f} ({np.std(num_tokens):.4f})')\n",
    "print(np.sum(num_tokens))"
   ],
   "id": "a08a02a4a3191d06",
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "",
   "id": "f36ee2396783b02",
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
