{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Recalculate ROUGE-L for generated answers\n",
    "\n",
    "This notebook recomputes ROUGE-L for each sample by comparing `LLM_output` in `answers_with_metrics_gpt2-base.jsonl` with the reference `output` in `train.jsonl`. The two datasets are assumed to have the same ordering.\n",
    "\n",
    "- Source (generated): `/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.jsonl`\n",
    "- Source (ground truth): `/hy-tmp/dc/processed_data/dolly/full/gpt2/train.jsonl`\n",
    "- Output (updated): `/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl`\n",
    "\n",
    "ROUGE-L calculation follows `utils.RougeL`, which uses `rouge_score.RougeScorer(['rougeL'], use_stemmer=True)` and returns the F-measure.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-14T07:57:30.011244Z",
     "iopub.status.busy": "2025-08-14T07:57:30.010388Z",
     "iopub.status.idle": "2025-08-14T07:57:30.671064Z",
     "shell.execute_reply": "2025-08-14T07:57:30.670122Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previewing first entries:\n",
      "{'i': 0, 'gen_keys': ['prompt', 'LLM_output', 'lm_loss', 'Rouge_L'], 'gt_keys': ['instruction', 'prompt', 'input', 'output']}\n",
      "{'i': 1, 'gen_keys': ['prompt', 'LLM_output', 'lm_loss', 'Rouge_L'], 'gt_keys': ['instruction', 'prompt', 'input', 'output']}\n",
      "{'i': 2, 'gen_keys': ['prompt', 'LLM_output', 'lm_loss', 'Rouge_L'], 'gt_keys': ['instruction', 'prompt', 'input', 'output']}\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from typing import Iterator, Tuple, Dict, Any\n",
    "from rouge_score import rouge_scorer\n",
    "from pathlib import Path\n",
    "\n",
    "# File paths\n",
    "GENERATED_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.jsonl')\n",
    "TRUTH_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/train.jsonl')\n",
    "OUTPUT_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl')\n",
    "\n",
    "# ROUGE-L function as in utils.RougeL\n",
    "_def_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
    "\n",
    "def rouge_l(prediction: str, ground_truth: str) -> float:\n",
    "    scores = _def_scorer.score(prediction=prediction, target=ground_truth)\n",
    "    return float(scores['rougeL'].fmeasure)\n",
    "\n",
    "\n",
    "def stream_jsonl(path: Path) -> Iterator[Dict[str, Any]]:\n",
    "    with path.open('r') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield json.loads(line)\n",
    "\n",
    "\n",
    "def preview_first_n(n: int = 3) -> None:\n",
    "    print('Previewing first entries:')\n",
    "    gen_iter = stream_jsonl(GENERATED_PATH)\n",
    "    gt_iter = stream_jsonl(TRUTH_PATH)\n",
    "    for i in range(n):\n",
    "        gen = next(gen_iter)\n",
    "        gt = next(gt_iter)\n",
    "        print({'i': i, 'gen_keys': list(gen.keys()), 'gt_keys': list(gt.keys())})\n",
    "\n",
    "preview_first_n(3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-14T07:57:30.747785Z",
     "iopub.status.busy": "2025-08-14T07:57:30.747467Z",
     "iopub.status.idle": "2025-08-14T07:58:09.901504Z",
     "shell.execute_reply": "2025-08-14T07:58:09.900426Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wrote 11435 lines to /hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl\n"
     ]
    }
   ],
   "source": [
    "from itertools import zip_longest\n",
    "\n",
    "\n",
    "def recalc_and_write(\n",
    "    gen_path: Path = GENERATED_PATH,\n",
    "    truth_path: Path = TRUTH_PATH,\n",
    "    out_path: Path = OUTPUT_PATH,\n",
    ") -> None:\n",
    "    gen_iter = stream_jsonl(gen_path)\n",
    "    truth_iter = stream_jsonl(truth_path)\n",
    "\n",
    "    count = 0\n",
    "    with out_path.open('w') as out_f:\n",
    "        for gen_item, truth_item in zip_longest(gen_iter, truth_iter):\n",
    "            if gen_item is None or truth_item is None:\n",
    "                raise ValueError('Files have different number of lines; cannot align by order.')\n",
    "\n",
    "            pred = gen_item.get('LLM_output', '')\n",
    "            tgt = truth_item.get('output', '')\n",
    "            score = rouge_l(pred, tgt)\n",
    "\n",
    "            # Copy and update\n",
    "            updated = dict(gen_item)\n",
    "            updated['Rouge_L'] = score\n",
    "\n",
    "            out_f.write(json.dumps(updated, ensure_ascii=False) + '\\n')\n",
    "            count += 1\n",
    "\n",
    "    print(f'Wrote {count} lines to {out_path}')\n",
    "\n",
    "\n",
    "recalc_and_write()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
