{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "46a52fd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd # Make sure pandas is imported\n",
    "import matplotlib.pyplot as plt\n",
    "import ast # For ast.literal_eval\n",
    "import pandas as pd\n",
    "import ast\n",
    "import os\n",
    "\n",
    "# Load the full annotation table.\n",
    "df = pd.read_csv('../data/all.csv')\n",
    "\n",
    "# Make sure the folder that receives this notebook's figures and tables exists.\n",
    "os.makedirs('./output', exist_ok=True)\n",
    "\n",
    "# Only use human annotations\n",
    "humandf = df[df['human']]\n",
    "\n",
    "# Reshape to long format: one record per (audioId, annotator, emotion) score\n",
    "# (scores are expected to be 0, 5 or 10). Each `emotions` cell is a dict literal\n",
    "# mapping emotion name -> score; literal_eval parses it without executing code.\n",
    "records = [\n",
    "    {'audioId': audio_id, 'emotion': emo, 'annotator': annotator, 'value': val}\n",
    "    for audio_id, annotator, emotions_literal in zip(\n",
    "        humandf['audioId'], humandf['annotator'], humandf['emotions']\n",
    "    )\n",
    "    for emo, val in ast.literal_eval(emotions_literal).items()\n",
    "]\n",
    "\n",
    "df_long = pd.DataFrame(records)\n",
    "\n",
    "# Aggregate by audioId and emotion, collect all values for that (audioId, emotion)\n",
    "df_agg = df_long.groupby(['audioId', 'emotion'])['value'].apply(list).reset_index()\n",
    "\n",
    "# Agreement can only be assessed when at least two annotators rated the same item.\n",
    "df_agg = df_agg[df_agg['value'].map(len) >= 2]\n",
    "\n",
    "\n",
    "# For each row, count number of present/absent and determine agreement type\n",
    "def categorize_agreement(values):\n",
    "    \"\"\"Name the agreement pattern of one item's ratings, counting any score > 0 as 'present'.\n",
    "\n",
    "    Args:\n",
    "        values: Sequence of numeric annotator scores for one (audioId, emotion) pair.\n",
    "\n",
    "    Returns:\n",
    "        The label of the matching 2-rater or 3-rater pattern, 'Other' for any other\n",
    "        non-zero number of ratings, or 'Undefined (0 annotators)' for an empty sequence.\n",
    "    \"\"\"\n",
    "    total = len(values)\n",
    "    if total == 0:\n",
    "        return 'Undefined (0 annotators)'\n",
    "\n",
    "    present = sum(1 for score in values if score > 0)\n",
    "\n",
    "    # Keyed by (number of ratings, number of 'present' ratings). The label is derived\n",
    "    # from the ratings actually recorded, not from the intended annotation workflow.\n",
    "    pattern_labels = {\n",
    "        (2, 2): '2:0 Agreement (+)',\n",
    "        (2, 1): '1:1 Disagreement',\n",
    "        (2, 0): '2:0 Agreement (-)',\n",
    "        (3, 3): '3:0 Agreement (+)',\n",
    "        (3, 2): '2:1 Partial (+ favored)',\n",
    "        (3, 1): '1:2 Partial (- favored)',\n",
    "        (3, 0): '3:0 Agreement (-)',\n",
    "    }\n",
    "    # A single rating or more than three ratings has no dedicated pattern.\n",
    "    return pattern_labels.get((total, present), 'Other')\n",
    "\n",
    "\n",
    "df_agg['agreement_type'] = df_agg['value'].apply(categorize_agreement)\n",
    "\n",
    "# Stacking order of the bar segments: strongest 'present' agreement first, then\n",
    "# disagreement, then strongest 'absent' agreement, with the catch-all 'Other' last.\n",
    "plot_categories_ordered = [\n",
    "    '3:0 Agreement (+)',\n",
    "    '2:0 Agreement (+)',\n",
    "    '2:1 Partial (+ favored)',\n",
    "    '1:1 Disagreement',\n",
    "    '1:2 Partial (- favored)',\n",
    "    '2:0 Agreement (-)',\n",
    "    '3:0 Agreement (-)',\n",
    "    'Other',\n",
    "]\n",
    "# Drop the categories that never occur in the data, keeping the order above.\n",
    "possible_categories_in_data = df_agg['agreement_type'].unique()\n",
    "plot_categories_ordered = [cat for cat in plot_categories_ordered if cat in possible_categories_in_data]\n",
    "\n",
    "# Rows are built in alphabetical emotion order; this is only a stable starting order.\n",
    "emotions_unique = df_agg['emotion'].unique()\n",
    "emotions_sorted_alpha = sorted(emotions_unique)\n",
    "\n",
    "# One row per emotion holding the share of its (audioId, emotion) items that fall into\n",
    "# each agreement type. A type that does not occur for an emotion contributes 0, which\n",
    "# also covers an emotion without any rows.\n",
    "data_for_plot = []\n",
    "for emo in emotions_sorted_alpha:\n",
    "    type_shares = df_agg.loc[df_agg['emotion'] == emo, 'agreement_type'].value_counts(normalize=True)\n",
    "    entry = {'emotion': emo}\n",
    "    entry.update({cat: type_shares.get(cat, 0) for cat in plot_categories_ordered})\n",
    "    data_for_plot.append(entry)\n",
    "\n",
    "plot_df = pd.DataFrame(data_for_plot)\n",
    "\n",
    "# Add n, %2-ratings, %3-ratings per emotion\n",
    "def count_ratings_info(subset_df_agg, emotion_name):\n",
    "    \"\"\"Summarise how many ratings the items of one emotion received.\n",
    "\n",
    "    Args:\n",
    "        subset_df_agg: DataFrame with an 'emotion' column and a 'value' column of rating lists.\n",
    "        emotion_name: Emotion whose rows are summarised.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (item count, % of items with exactly 2 ratings, % with exactly 3 ratings,\n",
    "        number of items with any other rating count); all zeros when the emotion has no rows.\n",
    "    \"\"\"\n",
    "    rows = subset_df_agg.loc[subset_df_agg['emotion'] == emotion_name]\n",
    "    total_instances = len(rows)\n",
    "    if not total_instances:\n",
    "        return 0, 0, 0, 0\n",
    "\n",
    "    ratings_per_item = rows['value'].map(len)\n",
    "    n2 = (ratings_per_item == 2).sum()\n",
    "    n3 = (ratings_per_item == 3).sum()\n",
    "\n",
    "    return (\n",
    "        total_instances,\n",
    "        100 * n2 / total_instances,\n",
    "        100 * n3 / total_instances,\n",
    "        total_instances - n2 - n3,\n",
    "    )\n",
    "\n",
    "# Rating-count summary for every emotion, in the same row order as plot_df.\n",
    "rating_info = [count_ratings_info(df_agg, emo) for emo in plot_df['emotion']]\n",
    "n_list = [info[0] for info in rating_info]\n",
    "pct2_list = [info[1] for info in rating_info]\n",
    "pct3_list = [info[2] for info in rating_info]\n",
    "n_other_list = [info[3] for info in rating_info]\n",
    "\n",
    "plot_df['n'] = n_list\n",
    "plot_df['pct2_ratings'] = pct2_list\n",
    "plot_df['pct3_ratings'] = pct3_list\n",
    "plot_df['n_other_ratings'] = n_other_list\n",
    "\n",
    "\n",
    "# Order the emotions by their combined share of unanimous 'present' agreement\n",
    "# (descending), breaking ties by the number of items 'n' (descending).\n",
    "# .get(..., 0) keeps this working when one of the two columns was filtered out.\n",
    "plot_df['strong_present_agreement_sum'] = (\n",
    "    plot_df.get('3:0 Agreement (+)', 0) + plot_df.get('2:0 Agreement (+)', 0)\n",
    ")\n",
    "plot_df = plot_df.sort_values(\n",
    "    by=['strong_present_agreement_sum', 'n'],\n",
    "    ascending=[False, False],\n",
    ").reset_index(drop=True)\n",
    "\n",
    "\n",
    "# Fill colour and hatch pattern per agreement category.\n",
    "# The keys of both dicts must match the category labels exactly: the plotting code\n",
    "# looks them up with .get(), so a misspelled key silently falls back to a default.\n",
    "colors = {\n",
    "    '3:0 Agreement (+)': '#2ca02c',  # Green\n",
    "    '2:0 Agreement (+)': '#98df8a',  # Light Green\n",
    "    '2:1 Partial (+ favored)': '#ffbb78',  # Light Orange/Yellow\n",
    "    '1:1 Disagreement': '#ff7f0e',  # Orange\n",
    "    '1:2 Partial (- favored)': '#aec7e8',  # Light Blue\n",
    "    '2:0 Agreement (-)': '#1f77b4',  # Blue\n",
    "    '3:0 Agreement (-)': '#17becf',   # Cyan\n",
    "    'Other': '#d9d9d9'  # Light Gray\n",
    "}\n",
    "hatches = {\n",
    "    '3:0 Agreement (+)': '++',\n",
    "    '2:0 Agreement (+)': '//',\n",
    "    '2:1 Partial (+ favored)': '.',\n",
    "    '1:1 Disagreement': 'xx',\n",
    "    '1:2 Partial (- favored)': '..',\n",
    "    '2:0 Agreement (-)': '\\\\\\\\',\n",
    "    '3:0 Agreement (-)': '--',\n",
    "    'Other': 'oo'\n",
    "}\n",
    "# Legend labels: each category is shown under its own name (edit the values to customise).\n",
    "labels = {key: key for key in plot_categories_ordered}\n",
    "\n",
    "\n",
    "# Horizontal stacked bar chart: one bar per emotion, one segment per agreement category.\n",
    "fig, ax = plt.subplots(figsize=(12, max(6, len(plot_df) * 0.2))) # Adjust height based on num emotions\n",
    "# Running left edge of the next segment for every emotion (sum of the widths drawn so far).\n",
    "bottom = np.zeros(len(plot_df))\n",
    "bars_handles = [] # To store one bar handle per category for the legend\n",
    "\n",
    "for cat_key in plot_categories_ordered: # Iterate through our defined order\n",
    "    if cat_key in plot_df.columns: # Check if this category has data\n",
    "        bar = ax.barh(\n",
    "            plot_df['emotion'],\n",
    "            plot_df[cat_key],\n",
    "            left=bottom,\n",
    "            color=colors.get(cat_key, '#808080'), # Default to gray if not defined\n",
    "            label=labels.get(cat_key, cat_key),\n",
    "            hatch=hatches.get(cat_key, ''),\n",
    "            edgecolor='black',\n",
    "            align='center'\n",
    "        )\n",
    "        # Only add the first bar handle for each category for the legend\n",
    "        if len(bar) > 0:\n",
    "            bars_handles.append(bar[0])\n",
    "        # Shift the start of the next category's segments past the ones just drawn.\n",
    "        bottom += plot_df[cat_key].fillna(0)\n",
    "\n",
    "# Add n, %2r, %3r as text on the right of each bar\n",
    "# NOTE(review): `i` is used as the y position of the label, which assumes plot_df has a\n",
    "# 0..len-1 index in plotted order -- confirm this still holds if the sorting step changes.\n",
    "for i, row_data in plot_df.iterrows():\n",
    "    n_val = row_data['n']\n",
    "    pct2_val = row_data['pct2_ratings']\n",
    "    pct3_val = row_data['pct3_ratings']\n",
    "    text_label = f\"n={n_val} | 2r: {pct2_val:.0f}% | 3r: {pct3_val:.0f}%\"\n",
    "    # Items with a rating count other than 2 or 3 are mentioned only when present.\n",
    "    if row_data['n_other_ratings'] > 0:\n",
    "        text_label += f\" | Other: {row_data['n_other_ratings']}\"\n",
    "    # x=1.015 places the text just beyond the x-limit of 1 that is set further down.\n",
    "    ax.text(\n",
    "        1.015, i,\n",
    "        text_label,\n",
    "        va='center', ha='left', fontsize=9,\n",
    "        color='black',\n",
    "    )\n",
    "\n",
    "ax.set_xlabel('Proportion of Annotations')\n",
    "ax.set_ylabel('Emotion')\n",
    "ax.set_title('Annotator Agreement on Emotion Presence (Human Ratings Only)')\n",
    "\n",
    "# Legend built from the collected handles; the label list applies the same column filter\n",
    "# as the drawing loop so that handles and labels line up. The anchor is given in axes\n",
    "# coordinates.\n",
    "ax.legend(\n",
    "    bars_handles,\n",
    "    [labels.get(cat_key, cat_key) for cat_key in plot_categories_ordered if cat_key in plot_df.columns],\n",
    "    title='Agreement Type',\n",
    "    loc='center left',\n",
    "    bbox_to_anchor=(0.01, 0.14),\n",
    "    borderaxespad=0.\n",
    ")\n",
    "\n",
    "plt.xlim(0, 1) # Proportions should be between 0 and 1\n",
    "\n",
    "# Remove vertical padding above/below bars\n",
    "ax.margins(y=0)\n",
    "ax.set_ylim(-0.5, len(plot_df)-0.5)\n",
    "\n",
    "plt.tight_layout()\n",
    "# bbox_inches='tight' is passed so the saved image is cropped to the drawn content.\n",
    "plt.savefig(\"./output/annotator_agreement_detailed_human.png\", dpi=600, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fabe8f56",
   "metadata": {},
   "outputs": [],
   "source": [
    "from ast import literal_eval\n",
    "\n",
    "# Keep only the rows whose 'human' flag is set (assumes 'human' is a boolean column -- TODO confirm).\n",
    "human_df = df[df['human']]\n",
    "def map_value(x):\n",
    "    \"\"\"Convert a row's `emotions` dict literal into a 0/1/2 score.\n",
    "\n",
    "    Only the first value of the parsed dict is considered: 10 -> 2, 5 -> 1,\n",
    "    anything else -> 0.\n",
    "    \"\"\"\n",
    "    scores = list(literal_eval(x).values())\n",
    "    first_score = scores[0]\n",
    "    if first_score == 10:\n",
    "        return 2\n",
    "    return 1 if first_score == 5 else 0\n",
    "\n",
    "# Attach the 0/1/2 score as a `value` column. assign() returns a new DataFrame, so the\n",
    "# column is not written into a filtered slice of `df` (avoids SettingWithCopyWarning).\n",
    "human_df = human_df.assign(value=human_df['emotions'].apply(map_value))\n",
    "\n",
    "# Group by audioId and check if all values are 1 or 2 for each group\n",
    "high_agreement = human_df.groupby(\"audioId\")[\"value\"].apply(lambda x: ((x == 1) | (x == 2)).all())\n",
    "\n",
    "# Count how many audioIds have high agreement\n",
    "num_high_agreement = high_agreement.sum()\n",
    "print(f\"Number of samples with high agreement (all annotators gave 1 or 2): {num_high_agreement}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8bd9f27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the per-annotation rows together with their 0/1/2 `value` score for inspection.\n",
    "human_df[[\"audioId\", \"annotator\", \"emotion_prompt\", \"value\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "955cb0ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import krippendorff\n",
    "\n",
    "# Independent copy: the item_id column added below must not end up in human_df.\n",
    "df = human_df.copy()\n",
    "\n",
    "print(\"Original DataFrame head:\")\n",
    "print(df.head())\n",
    "print(\"\\n\")\n",
    "\n",
    "# --- Overall Krippendorff's Alpha ---\n",
    "\n",
    "# A rated item is one (audioId, emotion_prompt) pair; give it a combined string key.\n",
    "df['item_id'] = df['audioId'].astype(str) + '_' + df['emotion_prompt']\n",
    "\n",
    "# Items become rows and annotators become columns. Cells stay NaN where an annotator\n",
    "# did not rate an item; pivot_table's default aggregation (mean) applies if an\n",
    "# annotator has several rows for the same item.\n",
    "reliability_data_overall = df.pivot_table(values='value', index='item_id', columns='annotator')\n",
    "\n",
    "# krippendorff.alpha is given the data as (raters, items), so transpose the\n",
    "# (items, raters) pivot; the NaN cells are kept as missing ratings.\n",
    "data_matrix_overall = reliability_data_overall.to_numpy().T\n",
    "\n",
    "print(\"Reliability data matrix for overall calculation (transposed, raters x items):\\n\", data_matrix_overall)\n",
    "print(\"\\n\")\n",
    "\n",
    "# Alpha over all items, treating the 0/1/2 scores as interval data.\n",
    "alpha_overall = krippendorff.alpha(data_matrix_overall, level_of_measurement='interval')\n",
    "\n",
    "print(f\"Overall Krippendorff's Alpha (level_of_measurement='interval'): {alpha_overall:.4f}\")\n",
    "print(\"-\" * 50)\n",
    "\n",
    "# --- Krippendorff's Alpha for Each Emotion Separately ---\n",
    "\n",
    "unique_emotions = df['emotion_prompt'].unique()\n",
    "print(\"\\nKrippendorff's Alpha for each emotion:\")\n",
    "\n",
    "for emotion in unique_emotions:\n",
    "    print(f\"\\n--- Emotion: {emotion} ---\")\n",
    "\n",
    "    # Within one emotion the audioId alone identifies the rated item. Annotators who\n",
    "    # skipped an audioId leave NaN cells in the pivot.\n",
    "    df_emotion = df.loc[df['emotion_prompt'] == emotion].copy()\n",
    "    reliability_data_emotion = df_emotion.pivot_table(values='value', index='audioId', columns='annotator')\n",
    "\n",
    "    # Alpha compares annotators with each other, so an emotion with fewer than two\n",
    "    # annotator columns is reported and skipped.\n",
    "    if reliability_data_emotion.shape[1] < 2:\n",
    "        print(f\"  Warning: Only {reliability_data_emotion.shape[1]} annotator(s) rated '{emotion}' for these items.\")\n",
    "        print(\"  Krippendorff's Alpha requires at least two annotators to compare.\")\n",
    "        continue\n",
    "\n",
    "    # (raters, items) layout, as for the overall calculation.\n",
    "    data_matrix_emotion = reliability_data_emotion.to_numpy().T\n",
    "\n",
    "    # A ValueError from the library is reported instead of stopping the loop.\n",
    "    try:\n",
    "        alpha_emotion = krippendorff.alpha(data_matrix_emotion, level_of_measurement='interval')\n",
    "        print(f\"  Krippendorff's Alpha for {emotion} (level_of_measurement='interval'): {alpha_emotion:.4f}\")\n",
    "    except ValueError as e:\n",
    "        print(f\"  Could not calculate alpha for {emotion}: {e}\")\n",
    "        print(\"  This can happen if there's insufficient data (e.g., only one rater, or no items with overlapping ratings).\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84f521b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def krippendorff_alpha_bootstrap(data_matrix, n_bootstrap=1000, seed=42):\n",
    "    \"\"\"\n",
    "    Percentile-bootstrap 95% confidence interval for interval-level Krippendorff's alpha.\n",
    "\n",
    "    Items (columns) are resampled with replacement n_bootstrap times. Resamples for\n",
    "    which the alpha computation raises or returns NaN are discarded.\n",
    "\n",
    "    data_matrix: numpy array (raters x items)\n",
    "    n_bootstrap: number of resamples to draw\n",
    "    seed: seed for the NumPy random generator, so the interval is reproducible\n",
    "    Returns: (lower, upper) 2.5th and 97.5th percentiles of the usable alphas,\n",
    "             or (None, None) when no resample produced a usable alpha\n",
    "    \"\"\"\n",
    "    generator = np.random.default_rng(seed)\n",
    "    item_count = data_matrix.shape[1]\n",
    "\n",
    "    estimates = []\n",
    "    for _ in range(n_bootstrap):\n",
    "        # Draw as many item indices as there are items, with replacement.\n",
    "        picked = generator.integers(0, item_count, item_count)\n",
    "        resampled = data_matrix[:, picked]\n",
    "        try:\n",
    "            estimate = krippendorff.alpha(resampled, level_of_measurement='interval')\n",
    "        except Exception:\n",
    "            # A failed resample is recorded as NaN and filtered out below.\n",
    "            estimate = np.nan\n",
    "        estimates.append(estimate)\n",
    "\n",
    "    estimates = np.array(estimates)\n",
    "    usable = estimates[np.logical_not(np.isnan(estimates))]\n",
    "    if usable.size == 0:\n",
    "        return None, None\n",
    "\n",
    "    lower, upper = np.percentile(usable, [2.5, 97.5])\n",
    "    return lower, upper\n",
    "\n",
    "# Collect alpha and its bootstrap CI for every emotion, then export the table.\n",
    "emotion_alpha_results = []\n",
    "for emotion in unique_emotions:\n",
    "    df_emotion = df.loc[df['emotion_prompt'] == emotion].copy()\n",
    "    reliability_data_emotion = df_emotion.pivot_table(values='value', index='audioId', columns='annotator')\n",
    "\n",
    "    # These defaults are kept when alpha cannot be computed for the emotion.\n",
    "    alpha_emotion, ci_lower, ci_upper = None, None, None\n",
    "    if reliability_data_emotion.shape[1] >= 2:\n",
    "        data_matrix_emotion = reliability_data_emotion.to_numpy().T\n",
    "        try:\n",
    "            alpha_emotion = krippendorff.alpha(data_matrix_emotion, level_of_measurement='interval')\n",
    "            ci_lower, ci_upper = krippendorff_alpha_bootstrap(data_matrix_emotion, n_bootstrap=1000, seed=42)\n",
    "        except Exception:\n",
    "            # Any failure blanks all three values, including an alpha already computed.\n",
    "            alpha_emotion, ci_lower, ci_upper = None, None, None\n",
    "\n",
    "    emotion_alpha_results.append({\n",
    "        \"emotion\": emotion,\n",
    "        \"alpha\": alpha_emotion,\n",
    "        \"alpha_ci_lower\": ci_lower,\n",
    "        \"alpha_ci_upper\": ci_upper,\n",
    "        \"n_items\": reliability_data_emotion.shape[0]\n",
    "    })\n",
    "\n",
    "# Overall alpha CI, appended as an extra row labelled \"Overall\".\n",
    "ci_lower, ci_upper = krippendorff_alpha_bootstrap(data_matrix_overall, n_bootstrap=1000, seed=42)\n",
    "emotion_alpha_results.append({\n",
    "    \"emotion\": \"Overall\",\n",
    "    \"alpha\": alpha_overall,\n",
    "    \"alpha_ci_lower\": ci_lower,\n",
    "    \"alpha_ci_upper\": ci_upper,\n",
    "    \"n_items\": reliability_data_overall.shape[0]\n",
    "})\n",
    "\n",
    "# Highest alpha first; the same table is written as CSV and as LaTeX.\n",
    "emotion_alpha_df = pd.DataFrame(emotion_alpha_results).sort_values(by=['alpha'], ascending=False)\n",
    "emotion_alpha_df.to_csv(\"./output/krippendorff_alpha_per_emotion.csv\", index=False)\n",
    "emotion_alpha_df.to_latex(\"./output/krippendorff_alpha_per_emotion.tex\", index=False, float_format=\"%.3f\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
