{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "\n",
    "# Make the parent directory importable so the project-local chair_metrics module resolves.\n",
    "parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
    "sys.path.insert(0, parent_dir)\n",
    "from chair_metrics import batch_compute_chair_metrics\n",
    "\n",
    "# Input log (one JSON object per line) and the prompt text stripped from each prediction.\n",
    "LOG_PATH = 'content_log_7b_5_18_hallu_retry-autojudge.jsonl'\n",
    "PROMPT_TEXT = 'Please describe the image in detail.'\n",
    "\n",
    "# 1. Read the log and group (pred, gt) pairs per image and per record type.\n",
    "records = defaultdict(dict)  # {file_name: {\"baseline\": (pred, gt), \"pruned\": (pred, gt)}}\n",
    "line_count = 0\n",
    "with open(LOG_PATH, 'r', encoding='utf-8') as f:\n",
    "    for line in f:\n",
    "        line_count += 1\n",
    "        if not line.strip():\n",
    "            # Blank lines carry no record; skip them without a parse warning.\n",
    "            continue\n",
    "        try:\n",
    "            entry = json.loads(line)\n",
    "        except json.JSONDecodeError as e:\n",
    "            print(f\"[WARN] Line {line_count}: JSON parsing failed, skipping: {e}\")\n",
    "            continue\n",
    "        if not isinstance(entry, dict):\n",
    "            # Valid JSON that is not an object (list, number, ...) has no .get().\n",
    "            print(f\"[WARN] Line {line_count}: Expected a JSON object, skipping: {entry!r}\")\n",
    "            continue\n",
    "        typ = entry.get('type')\n",
    "        fname = entry.get('file_name')\n",
    "        if not typ or not fname:\n",
    "            print(f\"[WARN] Line {line_count}: Missing 'type' or 'file_name', content: {entry}\")\n",
    "            continue\n",
    "        # The `or` fallbacks also cover keys that are present but null, which a\n",
    "        # .get() default does not; a null 'pred' would otherwise raise on .replace().\n",
    "        pred = (entry.get('pred') or '').replace(PROMPT_TEXT, '').strip()\n",
    "        gt = entry.get('gt_label') or []\n",
    "        records[fname][typ] = (pred, gt)\n",
    "\n",
    "print(f\"[DEBUG] Read a total of {line_count} lines, collected records for {len(records)} images.\")\n",
    "\n",
    "# 2. Score every image that has both a baseline and a pruned prediction.\n",
    "metrics = ['CHAIR-s', 'CHAIR-i', 'F1']\n",
    "all_baseline = {m: [] for m in metrics}\n",
    "all_pruned   = {m: [] for m in metrics}\n",
    "\n",
    "eval_count = 0\n",
    "skip_count = 0  # counts both incomplete pairs and SKIP predictions\n",
    "\n",
    "for fname, preds in records.items():\n",
    "    types = list(preds.keys())\n",
    "    print(f\"\\n[DEBUG] Processing file {fname}, record types: {types}\")\n",
    "\n",
    "    if 'baseline' not in preds or 'pruned' not in preds:\n",
    "        print(f\"[INFO] Incomplete record, skipping (only has {types})\")\n",
    "        skip_count += 1\n",
    "        continue\n",
    "\n",
    "    pred_b, gt_b = preds['baseline']\n",
    "    pred_p, gt_p = preds['pruned']\n",
    "\n",
    "    # A prediction whose whole text is SKIP (any case) is a sentinel, not a caption;\n",
    "    # drop the pair so both sides are averaged over the same set of images.\n",
    "    if pred_b.strip().upper() == 'SKIP' or pred_p.strip().upper() == 'SKIP':\n",
    "        print(\"[INFO] 'baseline' or 'pruned' is SKIP, skipping\")\n",
    "        skip_count += 1\n",
    "        continue\n",
    "\n",
    "    # Evaluate baseline (the metric function is called with single-element batches)\n",
    "    result_b = batch_compute_chair_metrics([pred_b], [gt_b])\n",
    "    # print(f\"[DEBUG] {fname} baseline evaluation: { {k: result_b[k] for k in metrics} }\")\n",
    "    # Evaluate pruned\n",
    "    result_p = batch_compute_chair_metrics([pred_p], [gt_p])\n",
    "    # print(f\"[DEBUG] {fname} pruned evaluation: { {k: result_p[k] for k in metrics} }\")\n",
    "\n",
    "    # Accumulate per-image scores; they are averaged across images below.\n",
    "    for m in metrics:\n",
    "        all_baseline[m].append(result_b[m])\n",
    "        all_pruned[m].append(result_p[m])\n",
    "    eval_count += 1\n",
    "\n",
    "print(\"\\n=== Final Statistics ===\")\n",
    "print(f\"Total files: {len(records)}, evaluated pairs: {eval_count}, skipped: {skip_count}\\n\")\n",
    "\n",
    "# 3. Compute and print averages and Delta\n",
    "# delta = baseline - pruned, so a positive delta means the pruned run scored lower\n",
    "# on that metric. NOTE(review): that reads as an improvement for CHAIR-s / CHAIR-i\n",
    "# but presumably as a regression for F1 - confirm before interpreting the sign.\n",
    "print('=== Evaluation Results ===')\n",
    "for m in metrics:\n",
    "    if all_baseline[m]:\n",
    "        baseline_avg = np.mean(all_baseline[m])\n",
    "        pruned_avg   = np.mean(all_pruned[m])\n",
    "        delta        = baseline_avg - pruned_avg\n",
    "        print(f\"{m}: baseline={baseline_avg:.4f}, pruned={pruned_avg:.4f}, delta={delta:+.4f}\")\n",
    "    else:\n",
    "        print(f\"{m}: No data to evaluate\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import os\n",
    "import sys\n",
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "\n",
    "# Import evaluation function\n",
    "parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
    "sys.path.insert(0, parent_dir)\n",
    "from chair_metrics import batch_compute_chair_metrics\n",
    "\n",
    "# Input log (one JSON object per line) and the prompt text stripped from each prediction.\n",
    "LOG_PATH = 'content_log_7b_5_18_hallu_retry-73-autojudge-bias.jsonl'\n",
    "PROMPT_TEXT = 'Please describe the image in detail.'\n",
    "\n",
    "# Read jsonl\n",
    "records = defaultdict(lambda: defaultdict(dict))  # {exp: {file_name: {\"baseline\": (pred, gt), \"pruned\": (pred, gt)}}}\n",
    "\n",
    "with open(LOG_PATH, 'r', encoding='utf-8') as f:\n",
    "    for line_no, line in enumerate(f, start=1):\n",
    "        if not line.strip():\n",
    "            # Blank lines (e.g. a trailing newline) carry no record.\n",
    "            continue\n",
    "        # Warn and skip on malformed lines instead of aborting the whole run.\n",
    "        try:\n",
    "            entry = json.loads(line)\n",
    "        except json.JSONDecodeError as e:\n",
    "            print(f\"[WARN] Line {line_no}: JSON parsing failed, skipping: {e}\")\n",
    "            continue\n",
    "        if not isinstance(entry, dict):\n",
    "            print(f\"[WARN] Line {line_no}: Expected a JSON object, skipping: {entry!r}\")\n",
    "            continue\n",
    "        exp = entry.get('experiment')\n",
    "        typ = entry.get('type')\n",
    "        fname = entry.get('file_name')\n",
    "        if exp is None or not typ or not fname:\n",
    "            print(f\"[WARN] Line {line_no}: Missing 'experiment', 'type' or 'file_name', content: {entry}\")\n",
    "            continue\n",
    "        # The `or` fallbacks cover keys that are missing or explicitly null.\n",
    "        pred = (entry.get('pred') or '').replace(PROMPT_TEXT, '').strip()\n",
    "        gt = entry.get('gt_label') or []\n",
    "        records[exp][fname][typ] = (pred, gt)\n",
    "\n",
    "metrics = ['CHAIR-s', 'CHAIR-i', 'F1']\n",
    "\n",
    "print('=== Groups with Gains (Decrease in CHAIR-s / CHAIR-i means gain) ===')\n",
    "for exp, file_records in records.items():\n",
    "    all_baseline = {m: [] for m in metrics}\n",
    "    all_pruned = {m: [] for m in metrics}\n",
    "    for fname, preds in file_records.items():\n",
    "        # Only images with both variants can be compared.\n",
    "        if 'baseline' not in preds or 'pruned' not in preds:\n",
    "            continue\n",
    "        pred_b, gt_b = preds['baseline']\n",
    "        pred_p, gt_p = preds['pruned']\n",
    "        # A prediction whose whole text is SKIP (any case) is a sentinel, not a caption;\n",
    "        # scoring it as one would distort the group means, so drop the pair.\n",
    "        if pred_b.strip().upper() == 'SKIP' or pred_p.strip().upper() == 'SKIP':\n",
    "            continue\n",
    "        result_b = batch_compute_chair_metrics([pred_b], [gt_b])\n",
    "        result_p = batch_compute_chair_metrics([pred_p], [gt_p])\n",
    "        for m in metrics:\n",
    "            all_baseline[m].append(result_b[m])\n",
    "            all_pruned[m].append(result_p[m])\n",
    "    # Only proceed if there is data\n",
    "    if not all_baseline['CHAIR-s']:\n",
    "        continue\n",
    "    # Mean values: {metric: (baseline mean, pruned mean)} for the two reported metrics.\n",
    "    means = {\n",
    "        m: (np.mean(all_baseline[m]), np.mean(all_pruned[m]))\n",
    "        for m in ('CHAIR-s', 'CHAIR-i')\n",
    "    }\n",
    "    # Determine gain: delta = baseline - pruned, kept only when strictly positive.\n",
    "    gains = {m: b - p for m, (b, p) in means.items() if b - p > 0}\n",
    "    if not gains:\n",
    "        continue\n",
    "    print(f'Experiment group {exp}:')\n",
    "    for m, delta in gains.items():\n",
    "        b, p = means[m]\n",
    "        print(f'    {m} gain: {delta:+.4f} (baseline={b:.4f}, pruned={p:.4f})')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "cir",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
