{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dc592f58",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>audioId</th>\n",
       "      <th>annotator</th>\n",
       "      <th>emotions</th>\n",
       "      <th>human</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Emotional Numbness/0/465f5424_enhanced.mp3</td>\n",
       "      <td>Hume Voice</td>\n",
       "      <td>{'Amusement': 0.02543581183999777, 'Anger': 0....</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Emotional Numbness/0/8674701e_enhanced_4.mp3</td>\n",
       "      <td>Hume Voice</td>\n",
       "      <td>{'Amusement': 0.17442310228943825, 'Anger': 2....</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Emotional Numbness/0/b2821e81_enhanced_4.mp3</td>\n",
       "      <td>Hume Voice</td>\n",
       "      <td>{'Amusement': 0.11560654267668724, 'Anger': 1....</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Emotional Numbness/0/44c06ae0_enhanced_1.mp3</td>\n",
       "      <td>Hume Voice</td>\n",
       "      <td>{'Amusement': 0.1725613884627819, 'Anger': 1.6...</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Emotional Numbness/0/dd4e4b41_enhanced_2.mp3</td>\n",
       "      <td>Hume Voice</td>\n",
       "      <td>{'Amusement': 0.21406544372439384, 'Anger': 0....</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119700</th>\n",
       "      <td>Triumph/3/8daa75c3_enhanced_2.mp3</td>\n",
       "      <td>Human 1</td>\n",
       "      <td>{'Triumph': 10.0}</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119701</th>\n",
       "      <td>Triumph/4/2c696580_enhanced_1.mp3</td>\n",
       "      <td>Human 1</td>\n",
       "      <td>{'Triumph': 5.0}</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119702</th>\n",
       "      <td>Triumph/4/70d4e597_enhanced_2.mp3</td>\n",
       "      <td>Human 1</td>\n",
       "      <td>{'Triumph': 5.0}</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119703</th>\n",
       "      <td>Triumph/4/995575ca_enhanced_1.mp3</td>\n",
       "      <td>Human 1</td>\n",
       "      <td>{'Triumph': 0.0}</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119704</th>\n",
       "      <td>Triumph/4/d2032ec5_enhanced_3.mp3</td>\n",
       "      <td>Human 1</td>\n",
       "      <td>{'Triumph': 10.0}</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>119705 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             audioId   annotator  \\\n",
       "0         Emotional Numbness/0/465f5424_enhanced.mp3  Hume Voice   \n",
       "1       Emotional Numbness/0/8674701e_enhanced_4.mp3  Hume Voice   \n",
       "2       Emotional Numbness/0/b2821e81_enhanced_4.mp3  Hume Voice   \n",
       "3       Emotional Numbness/0/44c06ae0_enhanced_1.mp3  Hume Voice   \n",
       "4       Emotional Numbness/0/dd4e4b41_enhanced_2.mp3  Hume Voice   \n",
       "...                                              ...         ...   \n",
       "119700             Triumph/3/8daa75c3_enhanced_2.mp3     Human 1   \n",
       "119701             Triumph/4/2c696580_enhanced_1.mp3     Human 1   \n",
       "119702             Triumph/4/70d4e597_enhanced_2.mp3     Human 1   \n",
       "119703             Triumph/4/995575ca_enhanced_1.mp3     Human 1   \n",
       "119704             Triumph/4/d2032ec5_enhanced_3.mp3     Human 1   \n",
       "\n",
       "                                                 emotions  human  \n",
       "0       {'Amusement': 0.02543581183999777, 'Anger': 0....  False  \n",
       "1       {'Amusement': 0.17442310228943825, 'Anger': 2....  False  \n",
       "2       {'Amusement': 0.11560654267668724, 'Anger': 1....  False  \n",
       "3       {'Amusement': 0.1725613884627819, 'Anger': 1.6...  False  \n",
       "4       {'Amusement': 0.21406544372439384, 'Anger': 0....  False  \n",
       "...                                                   ...    ...  \n",
       "119700                                  {'Triumph': 10.0}   True  \n",
       "119701                                   {'Triumph': 5.0}   True  \n",
       "119702                                   {'Triumph': 5.0}   True  \n",
       "119703                                   {'Triumph': 0.0}   True  \n",
       "119704                                  {'Triumph': 10.0}   True  \n",
       "\n",
       "[119705 rows x 4 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import ast\n",
    "import numpy as np\n",
    "import json\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import re\n",
    "\n",
    "# Load every annotation row from the combined CSV and drop the two columns this\n",
    "# notebook never reads. Using the non-inplace `drop` keeps the cell a single\n",
    "# assignment that yields the same `df` every time it is re-run.\n",
    "df = pd.read_csv('../data/all.csv').drop(columns=['intensity', 'emotion_prompt'])\n",
    "\n",
    "# Fail fast if the CSV is missing a column that the later cells depend on.\n",
    "required_columns = {'audioId', 'annotator', 'human', 'emotions'}\n",
    "missing_columns = required_columns - set(df.columns)\n",
    "assert not missing_columns, f'all.csv is missing columns: {sorted(missing_columns)}'\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a279539c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Annotator: Hume Voice, Samples: 12600\n",
      "Annotator: GPT-4o Mini Audio Preview, Samples: 12600\n",
      "Annotator: Gemini 2.5 Pro, Samples: 12600\n",
      "Annotator: GPT-4o Audio Preview 2024-12-17, Samples: 10500\n",
      "Annotator: Empathic Insight Voice Small, Samples: 12600\n",
      "Annotator: Empathic Insight Voice Large, Samples: 12600\n",
      "Annotator: Gemini 2.0 Flash, Samples: 12600\n",
      "Annotator: Human 2, Samples: 6620\n",
      "Annotator: Human 4, Samples: 11605\n",
      "Annotator: Human 1, Samples: 6837\n",
      "Annotator: Human 6, Samples: 5600\n",
      "Annotator: Human 3, Samples: 2600\n",
      "Annotator: Human 5, Samples: 343\n"
     ]
    }
   ],
   "source": [
    "# Report how many rows of `df` each annotator contributed, visiting annotators\n",
    "# in the order `unique()` returns them.\n",
    "for annotator_name in df['annotator'].unique():\n",
    "    annotator_rows = df.loc[df['annotator'] == annotator_name]\n",
    "    print(f\"Annotator: {annotator_name}, Samples: {len(annotator_rows)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "854942f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>audioId</th>\n",
       "      <th>scores</th>\n",
       "      <th>emotion</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Affection/0/1f885d6f_enhanced.mp3</td>\n",
       "      <td>{'Hume Voice': 0.05169641226530075, 'GPT-4o Mi...</td>\n",
       "      <td>Affection</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Affection/0/4cb03b26_enhanced_1.mp3</td>\n",
       "      <td>{'Hume Voice': 0.23765334859490395, 'GPT-4o Mi...</td>\n",
       "      <td>Affection</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Affection/0/90aa20a0_enhanced_3.mp3</td>\n",
       "      <td>{'Hume Voice': 0.09479721076786518, 'GPT-4o Mi...</td>\n",
       "      <td>Affection</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Affection/0/a3a29774_enhanced_1.mp3</td>\n",
       "      <td>{'Hume Voice': 0.04231022670865059, 'GPT-4o Mi...</td>\n",
       "      <td>Affection</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Affection/0/fc9b285d_enhanced_1.mp3</td>\n",
       "      <td>{'Hume Voice': 0.3538057208061218, 'GPT-4o Min...</td>\n",
       "      <td>Affection</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               audioId  \\\n",
       "0    Affection/0/1f885d6f_enhanced.mp3   \n",
       "1  Affection/0/4cb03b26_enhanced_1.mp3   \n",
       "2  Affection/0/90aa20a0_enhanced_3.mp3   \n",
       "3  Affection/0/a3a29774_enhanced_1.mp3   \n",
       "4  Affection/0/fc9b285d_enhanced_1.mp3   \n",
       "\n",
       "                                              scores    emotion  \n",
       "0  {'Hume Voice': 0.05169641226530075, 'GPT-4o Mi...  Affection  \n",
       "1  {'Hume Voice': 0.23765334859490395, 'GPT-4o Mi...  Affection  \n",
       "2  {'Hume Voice': 0.09479721076786518, 'GPT-4o Mi...  Affection  \n",
       "3  {'Hume Voice': 0.04231022670865059, 'GPT-4o Mi...  Affection  \n",
       "4  {'Hume Voice': 0.3538057208061218, 'GPT-4o Min...  Affection  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from ast import literal_eval\n",
    "from collections import defaultdict\n",
    "\n",
    "# Filter human annotations\n",
    "human_df = df[df['human'] == True].copy()\n",
    "\n",
    "# Parse every row's `emotions` cell exactly once. Cells read from the CSV are string\n",
    "# reprs of dicts; anything that is not a string is passed through unchanged.\n",
    "parsed_emotions = [\n",
    "    literal_eval(raw) if isinstance(raw, str) else raw\n",
    "    for raw in df['emotions']\n",
    "]\n",
    "\n",
    "# Single pass over `df` that builds two lookups:\n",
    "#   annotations_by_audio: audioId -> [(annotator, emotions dict), ...] in row order\n",
    "#   audioid2emotions:     audioId -> set of emotions rated by at least one human\n",
    "# Indexing by audioId up front replaces a full-frame filter (plus a re-parse of every\n",
    "# matching row) for each (audioId, emotion) pair further down.\n",
    "annotations_by_audio = defaultdict(list)\n",
    "audioid2emotions = {}\n",
    "for audio_id, annotator, is_human, emo_dict in zip(\n",
    "    df['audioId'], df['annotator'], df['human'] == True, parsed_emotions\n",
    "):\n",
    "    annotations_by_audio[audio_id].append((annotator, emo_dict))\n",
    "    if is_human:\n",
    "        audioid2emotions.setdefault(audio_id, set()).update(emo_dict)\n",
    "\n",
    "# NOTE(review): this list is not referenced anywhere in the notebook; it looks like it\n",
    "# was meant to name the models expected to score every emotion - confirm or remove.\n",
    "all_emotions_should_be_present = [\"Empathic Insight Voice Large\", \"Empathic Insight Voice Small\", \"Gemini 2.0 Flash\", \"Gemini 2.5 Pro\", \"Hume Voice\"]\n",
    "\n",
    "# Prepare rows for the new DataFrame: one row per (audioId, human-rated emotion),\n",
    "# holding every annotator's score for that emotion on that clip.\n",
    "rows = []\n",
    "for audioId, emotions_set in audioid2emotions.items():\n",
    "    clip_annotations = annotations_by_audio[audioId]\n",
    "    # Iterate in sorted order: the iteration order of a set of strings changes between\n",
    "    # kernel restarts (hash randomisation), which would make the row order of\n",
    "    # small_df - and any positional lookup into it - non-reproducible.\n",
    "    for emotion in sorted(emotions_set):\n",
    "        # Annotators that did not score this emotion are simply absent from the dict.\n",
    "        scores = {\n",
    "            annotator: emo_dict[emotion]\n",
    "            for annotator, emo_dict in clip_annotations\n",
    "            if emotion in emo_dict\n",
    "        }\n",
    "        rows.append({\n",
    "            \"audioId\": audioId,\n",
    "            \"scores\": scores,\n",
    "            \"emotion\": emotion\n",
    "        })\n",
    "\n",
    "small_df = pd.DataFrame(rows)\n",
    "small_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "463b0d8b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset: Hume Voice with 12600 vs 7666\n",
      "Dataset: GPT-4o Mini Audio Preview with 12600 vs 12315\n",
      "Dataset: Gemini 2.5 Pro with 12600 vs 12600\n",
      "Dataset: GPT-4o Audio Preview 2024-12-17 with 10500 vs 9124\n",
      "Dataset: Empathic Insight Voice Small with 12600 vs 12600\n",
      "Dataset: Empathic Insight Voice Large with 12600 vs 12600\n",
      "Dataset: Gemini 2.0 Flash with 12600 vs 12599\n",
      "Dataset: Human 2 with 6620 vs 6620\n",
      "Dataset: Human 4 with 11605 vs 11605\n",
      "Dataset: Human 1 with 6837 vs 6837\n",
      "Dataset: Human 6 with 5600 vs 5600\n",
      "Dataset: Human 3 with 2600 vs 2600\n",
      "Dataset: Human 5 with 343 vs 343\n"
     ]
    }
   ],
   "source": [
    "# For each annotator, print its raw row count in `df` next to the number of\n",
    "# small_df rows whose `scores` mapping contains an entry for that annotator.\n",
    "for annotator in df['annotator'].unique():\n",
    "    raw_rows = df.loc[df['annotator'] == annotator]\n",
    "    pivoted_hits = sum(annotator in score_map for score_map in small_df['scores'])\n",
    "    print(f\"Dataset: {annotator} with {len(raw_rows)} vs {pivoted_hits}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "183a33ef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'GPT-4o Mini Audio Preview': 4,\n",
       " 'Gemini 2.5 Pro': 4,\n",
       " 'Empathic Insight Voice Small': 5.631923079490662,\n",
       " 'Empathic Insight Voice Large': 4.404549598693848,\n",
       " 'Gemini 2.0 Flash': 0,\n",
       " 'Human 2': 0.0,\n",
       " 'Human 4': 0.0}"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check one row of small_df: the annotator -> score mapping at position 100.\n",
    "small_df['scores'].iloc[100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "02dca72d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from ast import literal_eval\n",
    "import pandas as pd\n",
    "\n",
    "import os\n",
    "\n",
    "# Absolute error of each model score against the human reference, keyed by model name.\n",
    "model_errors = {}\n",
    "\n",
    "for scores in small_df['scores']:\n",
    "    # Annotators whose name starts with \"human\" (case-insensitive) form the reference;\n",
    "    # every other annotator is treated as a model.\n",
    "    human_scores = {k: v for k, v in scores.items() if k.lower().startswith('human')}\n",
    "    model_scores = {k: v for k, v in scores.items() if not k.lower().startswith('human')}\n",
    "    # A row needs at least one score from each side to contribute an error.\n",
    "    if not human_scores or not model_scores:\n",
    "        continue\n",
    "    # Use the mean of human scores if multiple humans\n",
    "    human_score = sum(human_scores.values()) / len(human_scores)\n",
    "    # NOTE(review): the subtraction assumes every annotator scores on the same scale\n",
    "    # as the humans - confirm, otherwise MAE is not comparable across models.\n",
    "    for model, model_score in model_scores.items():\n",
    "        model_errors.setdefault(model, []).append(abs(model_score - human_score))\n",
    "\n",
    "# Compute mean absolute error for each model\n",
    "mae = {model: sum(errs) / len(errs) for model, errs in model_errors.items()}\n",
    "\n",
    "# One row per model, lowest error first.\n",
    "mae_df = (\n",
    "    pd.DataFrame(list(mae.items()), columns=['Model', 'Mean Absolute Error'])\n",
    "    .sort_values('Mean Absolute Error')\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# Writing to a path does not create missing parent directories, so make sure\n",
    "# the output folder exists before exporting the LaTeX table.\n",
    "os.makedirs('output', exist_ok=True)\n",
    "mae_df.to_latex('output/mae.tex', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "2e109039",
   "metadata": {},
   "outputs": [],
   "source": [
    "from ast import literal_eval\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from scipy.stats import pearsonr, spearmanr\n",
    "from sklearn.metrics import r2_score # Using sklearn's r2_score is robust\n",
    "\n",
    "import os\n",
    "\n",
    "# `small_df['scores']` is expected to hold dicts mapping annotator name -> score.\n",
    "# Rows whose value is not a dict are reported and skipped below.\n",
    "\n",
    "# Per-model paired observations. For every small_df row that has at least one human\n",
    "# score and at least one model score we store, per model:\n",
    "#   predictions - the model's score\n",
    "#   actuals     - the mean of the human scores on that row\n",
    "#   errors      - prediction minus actual (signed)\n",
    "model_performance_data = {}\n",
    "\n",
    "for scores_dict in small_df['scores']:\n",
    "    if not isinstance(scores_dict, dict):\n",
    "        print(f\"Warning: 'scores' in row is not a dict: {scores_dict}. Skipping.\")\n",
    "        continue\n",
    "\n",
    "    # Annotators whose name starts with \"human\" (case-insensitive) form the reference;\n",
    "    # every other annotator is treated as a model.\n",
    "    human_scores_map = {k: v for k, v in scores_dict.items() if k.lower().startswith('human')}\n",
    "    model_scores_map = {k: v for k, v in scores_dict.items() if not k.lower().startswith('human')}\n",
    "\n",
    "    if not human_scores_map or not model_scores_map:\n",
    "        continue\n",
    "\n",
    "    # Use the mean of human scores if multiple humans\n",
    "    human_mean_score = sum(human_scores_map.values()) / len(human_scores_map)\n",
    "\n",
    "    for model_name, model_pred_score in model_scores_map.items():\n",
    "        # Create the model's storage the first time it is seen.\n",
    "        model_entry = model_performance_data.setdefault(model_name, {\n",
    "            'predictions': [],\n",
    "            'actuals': [],\n",
    "            'errors': [],\n",
    "        })\n",
    "\n",
    "        model_entry['predictions'].append(model_pred_score)\n",
    "        model_entry['actuals'].append(human_mean_score)\n",
    "        model_entry['errors'].append(model_pred_score - human_mean_score)\n",
    "\n",
    "# Compute metrics for each model\n",
    "results_list = []\n",
    "for model_name, data in model_performance_data.items():\n",
    "    if not data['predictions']:  # defensive: entries are only created when data is added\n",
    "        continue\n",
    "\n",
    "    predictions = np.array(data['predictions'])\n",
    "    actuals = np.array(data['actuals'])\n",
    "    errors = np.array(data['errors'])  # prediction - actual\n",
    "\n",
    "    n_samples = len(predictions)\n",
    "\n",
    "    mae = np.mean(np.abs(errors))\n",
    "    mse = np.mean(errors ** 2)\n",
    "    rmse = np.sqrt(mse)\n",
    "\n",
    "    # Reset both correlations for every model. Without this, a model with fewer than\n",
    "    # two samples would either raise NameError (if it is the first model) or silently\n",
    "    # report the previous model's Spearman value.\n",
    "    spearman_corr = np.nan\n",
    "    pearson_corr = np.nan\n",
    "\n",
    "    # Correlations need at least two paired observations.\n",
    "    if n_samples >= 2:\n",
    "        try:\n",
    "            spearman_corr, _ = spearmanr(actuals, predictions)\n",
    "        except ValueError:\n",
    "            spearman_corr = np.nan\n",
    "\n",
    "        # Pearson r is only computed when both series actually vary.\n",
    "        if np.var(actuals) > 1e-9 and np.var(predictions) > 1e-9:\n",
    "            try:\n",
    "                pearson_corr, _ = pearsonr(actuals, predictions)\n",
    "            except ValueError:\n",
    "                pearson_corr = np.nan\n",
    "        elif np.array_equal(actuals, predictions):\n",
    "            # Identical series (at least one of them near-constant): report 1.0.\n",
    "            pearson_corr = 1.0\n",
    "        else:\n",
    "            # A (near-)constant series has no defined correlation; this notebook\n",
    "            # reports 0.0 in that case (np.nan would be the alternative).\n",
    "            pearson_corr = 0.0\n",
    "\n",
    "    results_list.append({\n",
    "        'Model': model_name,\n",
    "        'N Samples': n_samples,\n",
    "        'Spearman r': spearman_corr,\n",
    "        'Pearson r': pearson_corr,\n",
    "        'MAE': mae,\n",
    "        'RMSE': rmse,\n",
    "    })\n",
    "\n",
    "# Create DataFrame from the results\n",
    "metrics_df = pd.DataFrame(results_list)\n",
    "\n",
    "# Rank models by Spearman correlation with the human mean, best first.\n",
    "metrics_df = metrics_df.sort_values('Spearman r', ascending=False).reset_index(drop=True)\n",
    "\n",
    "# Writing to a path does not create missing parent directories, so make sure\n",
    "# the output folder exists before exporting the LaTeX table.\n",
    "os.makedirs('output', exist_ok=True)\n",
    "metrics_df.to_latex('output/metrics.tex', index=False, float_format=\"%.3f\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
