{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "input_file = \"test.csv\"\n",
    "\n",
    "# Line-shape patterns, compiled once instead of on every input line.\n",
    "HEAD_RE = re.compile(r\"^===== *(.+?) *=====$\")\n",
    "BASELINE_ROW_RE = re.compile(r'^[0-9]+,')\n",
    "# NOTE: a pruned row is only recognised when its first field contains a decimal point.\n",
    "PRUNED_ROW_RE = re.compile(r'^[0-9]+\\.[0-9]+,')\n",
    "ALPHA_BETA_RE = re.compile(r\"α=([\\d.]+),β=([\\d.]+)\")\n",
    "\n",
    "# One dict per parsed data row (Baseline rows and pruned rows alike)\n",
    "records = []\n",
    "# (head group, alpha, beta) -> {metric name: rate string}, taken from the \"avg-ratio\" summary lines\n",
    "summary_rates = {}\n",
    "\n",
    "current_head = None\n",
    "mode = None  # \"baseline\" or \"pruned\"\n",
    "\n",
    "with open(input_file, encoding='utf-8') as f:\n",
    "    for raw_line in f:\n",
    "        line = raw_line.strip()\n",
    "        if not line:\n",
    "            continue\n",
    "\n",
    "        # The header row of the baseline table opens the Baseline section\n",
    "        if line.startswith(\"Round,CHAIR-s\"):\n",
    "            mode = \"baseline\"\n",
    "            continue\n",
    "\n",
    "        # A \"===== name =====\" separator names the head group and switches to Pruned mode\n",
    "        m_head = HEAD_RE.match(line)\n",
    "        if m_head:\n",
    "            current_head = m_head.group(1)\n",
    "            mode = \"pruned\"\n",
    "            continue\n",
    "\n",
    "        # 1) Baseline row: exactly five comma-separated fields\n",
    "        if mode == \"baseline\" and BASELINE_ROW_RE.match(line):\n",
    "            rnd, chair_s, chair_i, f1, length = line.split(\",\")\n",
    "            records.append({\n",
    "                'HeadGroup': 'Baseline',\n",
    "                'Alpha':      np.nan,\n",
    "                'Beta':       np.nan,\n",
    "                'Round':      int(rnd),\n",
    "                'Type':       'Baseline',\n",
    "                'CHAIR-s':    float(chair_s),\n",
    "                'CHAIR-i':    float(chair_i),\n",
    "                'F1':         float(f1),\n",
    "                'Len':        float(length)\n",
    "            })\n",
    "            continue\n",
    "\n",
    "        # 2) Pruned summary line: \"α=..,β=.. ... avg-ratio: key=value, key=value, ...\"\n",
    "        if mode == \"pruned\" and line.startswith(\"α=\") and \"avg-ratio\" in line:\n",
    "            m_ab = ALPHA_BETA_RE.match(line)\n",
    "            if m_ab:\n",
    "                a = float(m_ab.group(1))\n",
    "                b = float(m_ab.group(2))\n",
    "                # partition() tolerates a missing colon after \"avg-ratio\";\n",
    "                # split(\"avg-ratio:\")[1] would raise IndexError in that case.\n",
    "                rest = line.partition(\"avg-ratio\")[2].lstrip(\":\")\n",
    "                rates = {}\n",
    "                for part in rest.split(\",\"):\n",
    "                    key, sep, value = part.partition(\"=\")\n",
    "                    # Fragments without \"=\" (e.g. after a trailing comma) are skipped instead of crashing\n",
    "                    if sep:\n",
    "                        rates[key.strip()] = value.strip()\n",
    "                summary_rates[(current_head, a, b)] = rates\n",
    "            continue\n",
    "\n",
    "        # 3) Pruned data row: alpha, beta, round, type, then the four metrics\n",
    "        if mode == \"pruned\" and PRUNED_ROW_RE.match(line):\n",
    "            parts = line.split(\",\")\n",
    "            a, b = map(float, parts[0:2])\n",
    "            rnd  = int(parts[2])\n",
    "            typ  = parts[3]\n",
    "            chair_s, chair_i, f1, length = map(float, parts[4:8])\n",
    "            records.append({\n",
    "                'HeadGroup': current_head,\n",
    "                'Alpha':      a,\n",
    "                'Beta':       b,\n",
    "                'Round':      rnd,\n",
    "                'Type':       typ,\n",
    "                'CHAIR-s':    chair_s,\n",
    "                'CHAIR-i':    chair_i,\n",
    "                'F1':         f1,\n",
    "                'Len':        length\n",
    "            })\n",
    "            continue\n",
    "\n",
    "# Fail loudly when nothing matched, rather than continuing with an empty table\n",
    "if not records:\n",
    "    raise ValueError(\"No data rows parsed, please check input format.\")\n",
    "\n",
    "def percent_or_float(s):\n",
    "    \"\"\"Convert a rate such as \"-5.2%\" or \"0.31\" to a float.\n",
    "\n",
    "    Percent signs are dropped without rescaling, so \"-5.2%\" gives -5.2.\n",
    "    Missing values (None, empty or whitespace-only strings) give 0.0.\n",
    "    Values that are already int/float are returned as float.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the remaining text is not a valid float literal.\n",
    "    \"\"\"\n",
    "    if s is None:\n",
    "        return 0.0\n",
    "    # Numbers have no \"%\" to strip; testing '\"%\" in s' on them would raise TypeError\n",
    "    if isinstance(s, (int, float)):\n",
    "        return float(s)\n",
    "    text = str(s).strip()\n",
    "    if not text:\n",
    "        return 0.0\n",
    "    return float(text.replace(\"%\", \"\"))\n",
    "\n",
    "# Build one DataFrame from all parsed rows and make the numeric columns float\n",
    "df = pd.DataFrame(records)\n",
    "metrics = ['CHAIR-s','CHAIR-i','F1','Len']\n",
    "df[metrics] = df[metrics].astype(float)\n",
    "df['Alpha'] = df['Alpha'].astype(float)\n",
    "df['Beta']  = df['Beta'].astype(float)\n",
    "\n",
    "# Split Baseline and Pruned parts\n",
    "baseline_df = df[df['HeadGroup']=='Baseline']\n",
    "pruned_df   = df[df['HeadGroup']!='Baseline']\n",
    "\n",
    "# Format mean±std of each Baseline metric once; the same strings are reused on every output row\n",
    "baseline_stats = baseline_df[metrics].agg(['mean','std']).T\n",
    "baseline_fmt = {\n",
    "    m: f\"{baseline_stats.loc[m,'mean']:.4f}±{baseline_stats.loc[m,'std']:.4f}\"\n",
    "    for m in metrics\n",
    "}\n",
    "\n",
    "# One output row per (HeadGroup, Alpha, Beta): the group's own mean±std,\n",
    "# the Baseline mean±std, and the rate strings from the matching summary line\n",
    "group_cols = ['HeadGroup','Alpha','Beta']\n",
    "rows = []\n",
    "for (head, a, b), grp in pruned_df.groupby(group_cols):\n",
    "    entry = {'HeadGroup': head, 'Alpha': a, 'Beta': b}\n",
    "    for m in metrics:\n",
    "        mean = grp[m].mean()\n",
    "        std  = grp[m].std()\n",
    "        entry[f'{m} (μ±σ)'] = f\"{mean:.4f}±{std:.4f}\"\n",
    "    for m in metrics:\n",
    "        entry[f'Baseline {m} (μ±σ)'] = baseline_fmt[m]\n",
    "    # Groups without a summary line get empty rate strings\n",
    "    rates = summary_rates.get((head, a, b), {})\n",
    "    for m in metrics:\n",
    "        entry[f'Δ{m}'] = rates.get(m, '')\n",
    "    rows.append(entry)\n",
    "\n",
    "# Explicit column list (same order as the dict keys above) so that out_df still has\n",
    "# every column when there are no pruned rows; otherwise later lookups raise KeyError\n",
    "out_columns = (\n",
    "    group_cols\n",
    "    + [f'{m} (μ±σ)' for m in metrics]\n",
    "    + [f'Baseline {m} (μ±σ)' for m in metrics]\n",
    "    + [f'Δ{m}' for m in metrics]\n",
    ")\n",
    "out_df = pd.DataFrame(rows, columns=out_columns)\n",
    "\n",
    "# Compute sorting field: sum of ΔCHAIR-s and ΔCHAIR-i\n",
    "def extract_mean(s: str) -> float:\n",
    "    \"\"\"Return the part of a \"mean±std\" string before the '±' as a float.\"\"\"\n",
    "    mean_text, _, _ = s.partition('±')\n",
    "    return float(mean_text)\n",
    "\n",
    "# Numeric copies of the two CHAIR rate strings, used only for ranking\n",
    "for rate_col in ('ΔCHAIR-s', 'ΔCHAIR-i'):\n",
    "    out_df[f'{rate_col}(f)'] = out_df[rate_col].apply(percent_or_float)\n",
    "\n",
    "# Ranking key: the sum of both deltas; a more negative sum means a larger joint reduction\n",
    "out_df['_sum_delta'] = out_df['ΔCHAIR-s(f)'].add(out_df['ΔCHAIR-i(f)'])\n",
    "\n",
    "# Helper columns are needed for ranking but are not shown in the report\n",
    "helper_cols = ['_sum_delta','ΔCHAIR-s(f)','ΔCHAIR-i(f)']\n",
    "report_cols = [\n",
    "    'Alpha','Beta',\n",
    "    'ΔCHAIR-s','ΔCHAIR-i','ΔF1','ΔLen',\n",
    "    'CHAIR-s (μ±σ)','CHAIR-i (μ±σ)',\n",
    "    'F1 (μ±σ)','Len (μ±σ)',\n",
    "    'Baseline CHAIR-s (μ±σ)','Baseline CHAIR-i (μ±σ)',\n",
    "    'Baseline F1 (μ±σ)','Baseline Len (μ±σ)',\n",
    "]\n",
    "\n",
    "# Print, per HeadGroup, the 16 rows with the smallest ranking key as a markdown table\n",
    "ranked = out_df.sort_values('_sum_delta')\n",
    "for head, sub in ranked.groupby('HeadGroup'):\n",
    "    top16 = sub.nsmallest(16, '_sum_delta').drop(columns=helper_cols)\n",
    "    print(f\"\\n##### {head}: Top 16 Groups with Largest Joint Reduction in ΔCHAIR-s + ΔCHAIR-i (the more negative, the better)\")\n",
    "    print(top16[report_cols].to_markdown(index=False))\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cir",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
