{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# calculate ROUGE-L for generated answers (gpt2-xl)\n",
    "\n",
    "This notebook computes ROUGE-L for each sample by comparing `LLM_output` in `/hy-tmp/dc/processed_data/dolly/full/generate_data/dolly-512/e10-bs4-lr5e-05-G2-N1-NN1/14290/10/answers.jsonl` with the reference `output` in `train.jsonl`. The two datasets are assumed to have the same ordering.\n",
    "\n",
    "- Source (generated): `/hy-tmp/dc/processed_data/dolly/full/generate_data/dolly-512/e10-bs4-lr5e-05-G2-N1-NN1/14290/10/answers.jsonl`\n",
    "- Source (ground truth): `/hy-tmp/dc/processed_data/dolly/full/gpt2/train.jsonl`\n",
    "- Output (updated): `/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.new.jsonl`\n",
    "\n",
    "ROUGE-L calculation follows `utils.RougeL`, which uses `rouge_score.RougeScorer(['rougeL'], use_stemmer=True)` and returns the F-measure.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "please write a ipynb notebooks to re-rank the '/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl' based on \"prompt\" in '/hy-tmp/dc/processed_data/dolly/full/14290/answers_with_metrics_14290.new.jsonl', the re-ranked '/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl' have the same rank of '/hy-tmp/dc/processed_data/dolly/full/14290/answers_with_metrics_14290.new.jsonl'.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-15T12:06:45.104069Z",
     "iopub.status.busy": "2025-08-15T12:06:45.103238Z",
     "iopub.status.idle": "2025-08-15T12:06:45.687494Z",
     "shell.execute_reply": "2025-08-15T12:06:45.686735Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Previewing first entries:\n",
      "{'i': 0, 'gen_keys': ['prompt', 'LLM_output', 'lm_loss', 'Rouge_L'], 'gt_keys': ['instruction', 'prompt', 'input', 'output']}\n",
      "{'i': 1, 'gen_keys': ['prompt', 'LLM_output', 'lm_loss', 'Rouge_L'], 'gt_keys': ['instruction', 'prompt', 'input', 'output']}\n",
      "{'i': 2, 'gen_keys': ['prompt', 'LLM_output', 'lm_loss', 'Rouge_L'], 'gt_keys': ['instruction', 'prompt', 'input', 'output']}\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from typing import Iterator, Tuple, Dict, Any\n",
    "from rouge_score import rouge_scorer\n",
    "from pathlib import Path\n",
    "\n",
    "# File paths\n",
    "GENERATED_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.jsonl')\n",
    "TRUTH_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/train.jsonl')\n",
    "OUTPUT_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl_1.jsonl')\n",
    "\n",
    "# ROUGE-L function as in utils.RougeL\n",
    "_def_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)\n",
    "\n",
    "def rouge_l(prediction: str, ground_truth: str) -> float:\n",
    "    scores = _def_scorer.score(prediction=prediction, target=ground_truth)\n",
    "    return float(scores['rougeL'].fmeasure)\n",
    "\n",
    "\n",
    "def stream_jsonl(path: Path) -> Iterator[Dict[str, Any]]:\n",
    "    with path.open('r') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield json.loads(line)\n",
    "\n",
    "\n",
    "def preview_first_n(n: int = 3) -> None:\n",
    "    print('Previewing first entries:')\n",
    "    gen_iter = stream_jsonl(GENERATED_PATH)\n",
    "    gt_iter = stream_jsonl(TRUTH_PATH)\n",
    "    for i in range(n):\n",
    "        gen = next(gen_iter)\n",
    "        gt = next(gt_iter)\n",
    "        print({'i': i, 'gen_keys': list(gen.keys()), 'gt_keys': list(gt.keys())})\n",
    "\n",
    "preview_first_n(3)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-15T12:06:45.727835Z",
     "iopub.status.busy": "2025-08-15T12:06:45.726920Z",
     "iopub.status.idle": "2025-08-15T12:06:46.618941Z",
     "shell.execute_reply": "2025-08-15T12:06:46.618102Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows (base, xl): 11435 11435\n",
      "\n",
      "Summary (gpt2-base):\n",
      "                      mean  variance    median\n",
      "gpt2-base metric                              \n",
      "lm_loss           3.806960  7.322405  3.256193\n",
      "Rouge_L           0.115904  0.009221  0.102273\n",
      "\n",
      "Summary (gpt2-xl):\n",
      "                    mean  variance    median\n",
      "gpt2-xl metric                              \n",
      "lm_loss         2.990627  4.692519  2.605309\n",
      "Rouge_L         0.133526  0.011189  0.117647\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from typing import Iterator, Dict, Any, List\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "BASE_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl')\n",
    "XL_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.recalc.jsonl')\n",
    "\n",
    "sns.set(style='whitegrid')\n",
    "\n",
    "\n",
    "def stream_jsonl(path: Path) -> Iterator[Dict[str, Any]]:\n",
    "    with path.open('r') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield json.loads(line)\n",
    "\n",
    "\n",
    "def load_metrics(path: Path) -> pd.DataFrame:\n",
    "    data: List[Dict[str, Any]] = []\n",
    "    for item in stream_jsonl(path):\n",
    "        data.append({\n",
    "            'lm_loss': float(item.get('lm_loss', np.nan)),\n",
    "            'Rouge_L': float(item.get('Rouge_L', np.nan)),\n",
    "        })\n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "\n",
    "df_base = load_metrics(BASE_PATH)\n",
    "df_xl = load_metrics(XL_PATH)\n",
    "\n",
    "print('Rows (base, xl):', len(df_base), len(df_xl))\n",
    "\n",
    "def summarize(df: pd.DataFrame, name: str) -> pd.DataFrame:\n",
    "    summary = pd.DataFrame({\n",
    "        'mean': df[['lm_loss', 'Rouge_L']].mean(),\n",
    "        'variance': df[['lm_loss', 'Rouge_L']].var(ddof=1),\n",
    "        'median': df[['lm_loss', 'Rouge_L']].median(),\n",
    "    })\n",
    "    summary.index.name = f'{name} metric'\n",
    "    return summary\n",
    "\n",
    "summary_base = summarize(df_base, 'gpt2-base')\n",
    "summary_xl = summarize(df_xl, 'gpt2-xl')\n",
    "\n",
    "print('\\nSummary (gpt2-base):')\n",
    "print(summary_base)\n",
    "print('\\nSummary (gpt2-xl):')\n",
    "print(summary_xl)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-15T12:06:46.622918Z",
     "iopub.status.busy": "2025-08-15T12:06:46.622558Z",
     "iopub.status.idle": "2025-08-15T12:06:46.860238Z",
     "shell.execute_reply": "2025-08-15T12:06:46.859769Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows (base, xl): 11435 11435\n",
      "\n",
      "Summary (gpt2-base):\n",
      "                      mean  variance    median\n",
      "gpt2-base metric                              \n",
      "lm_loss           3.806960  7.322405  3.256193\n",
      "Rouge_L           0.115904  0.009221  0.102273\n",
      "\n",
      "Summary (gpt2-xl):\n",
      "                    mean  variance    median\n",
      "gpt2-xl metric                              \n",
      "lm_loss         2.990627  4.692519  2.605309\n",
      "Rouge_L         0.133526  0.011189  0.117647\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from typing import Iterator, Dict, Any, List\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "BASE_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl')\n",
    "XL_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.recalc.jsonl')\n",
    "\n",
    "sns.set(style='whitegrid')\n",
    "\n",
    "\n",
    "def stream_jsonl(path: Path) -> Iterator[Dict[str, Any]]:\n",
    "    with path.open('r') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield json.loads(line)\n",
    "\n",
    "\n",
    "def load_metrics(path: Path) -> pd.DataFrame:\n",
    "    data: List[Dict[str, Any]] = []\n",
    "    for item in stream_jsonl(path):\n",
    "        data.append({\n",
    "            'lm_loss': float(item.get('lm_loss', np.nan)),\n",
    "            'Rouge_L': float(item.get('Rouge_L', np.nan)),\n",
    "        })\n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "\n",
    "df_base = load_metrics(BASE_PATH)\n",
    "df_xl = load_metrics(XL_PATH)\n",
    "\n",
    "print('Rows (base, xl):', len(df_base), len(df_xl))\n",
    "\n",
    "def summarize(df: pd.DataFrame, name: str) -> pd.DataFrame:\n",
    "    summary = pd.DataFrame({\n",
    "        'mean': df[['lm_loss', 'Rouge_L']].mean(),\n",
    "        'variance': df[['lm_loss', 'Rouge_L']].var(ddof=1),\n",
    "        'median': df[['lm_loss', 'Rouge_L']].median(),\n",
    "    })\n",
    "    summary.index.name = f'{name} metric'\n",
    "    return summary\n",
    "\n",
    "summary_base = summarize(df_base, 'gpt2-base')\n",
    "summary_xl = summarize(df_xl, 'gpt2-xl')\n",
    "\n",
    "print('\\nSummary (gpt2-base):')\n",
    "print(summary_base)\n",
    "print('\\nSummary (gpt2-xl):')\n",
    "print(summary_xl)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-15T12:06:46.861877Z",
     "iopub.status.busy": "2025-08-15T12:06:46.861657Z",
     "iopub.status.idle": "2025-08-15T12:06:47.056272Z",
     "shell.execute_reply": "2025-08-15T12:06:47.055341Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows (base, xl): 11435 11435\n",
      "\n",
      "Summary (gpt2-base):\n",
      "                      mean  variance    median\n",
      "gpt2-base metric                              \n",
      "lm_loss           3.806960  7.322405  3.256193\n",
      "Rouge_L           0.115904  0.009221  0.102273\n",
      "\n",
      "Summary (gpt2-xl):\n",
      "                    mean  variance    median\n",
      "gpt2-xl metric                              \n",
      "lm_loss         2.990627  4.692519  2.605309\n",
      "Rouge_L         0.133526  0.011189  0.117647\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from typing import Iterator, Dict, Any, List\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "BASE_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl')\n",
    "XL_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.recalc.jsonl')\n",
    "\n",
    "sns.set(style='whitegrid')\n",
    "\n",
    "\n",
    "def stream_jsonl(path: Path) -> Iterator[Dict[str, Any]]:\n",
    "    with path.open('r') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield json.loads(line)\n",
    "\n",
    "\n",
    "def load_metrics(path: Path) -> pd.DataFrame:\n",
    "    data: List[Dict[str, Any]] = []\n",
    "    for item in stream_jsonl(path):\n",
    "        data.append({\n",
    "            'lm_loss': float(item.get('lm_loss', np.nan)),\n",
    "            'Rouge_L': float(item.get('Rouge_L', np.nan)),\n",
    "        })\n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "\n",
    "df_base = load_metrics(BASE_PATH)\n",
    "df_xl = load_metrics(XL_PATH)\n",
    "\n",
    "print('Rows (base, xl):', len(df_base), len(df_xl))\n",
    "\n",
    "def summarize(df: pd.DataFrame, name: str) -> pd.DataFrame:\n",
    "    summary = pd.DataFrame({\n",
    "        'mean': df[['lm_loss', 'Rouge_L']].mean(),\n",
    "        'variance': df[['lm_loss', 'Rouge_L']].var(ddof=1),\n",
    "        'median': df[['lm_loss', 'Rouge_L']].median(),\n",
    "    })\n",
    "    summary.index.name = f'{name} metric'\n",
    "    return summary\n",
    "\n",
    "summary_base = summarize(df_base, 'gpt2-base')\n",
    "summary_xl = summarize(df_xl, 'gpt2-xl')\n",
    "\n",
    "print('\\nSummary (gpt2-base):')\n",
    "print(summary_base)\n",
    "print('\\nSummary (gpt2-xl):')\n",
    "print(summary_xl)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-15T12:06:47.059658Z",
     "iopub.status.busy": "2025-08-15T12:06:47.059478Z",
     "iopub.status.idle": "2025-08-15T12:06:47.255138Z",
     "shell.execute_reply": "2025-08-15T12:06:47.254373Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows (base, xl): 11435 11435\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Summary (gpt2-base):\n",
      "                      mean  variance    median\n",
      "gpt2-base metric                              \n",
      "lm_loss           3.806960  7.322405  3.256193\n",
      "Rouge_L           0.115904  0.009221  0.102273\n",
      "\n",
      "Summary (gpt2-xl):\n",
      "                    mean  variance    median\n",
      "gpt2-xl metric                              \n",
      "lm_loss         2.990627  4.692519  2.605309\n",
      "Rouge_L         0.133526  0.011189  0.117647\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from typing import Iterator, Dict, Any, List\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "BASE_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl')\n",
    "XL_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.recalc.jsonl')\n",
    "\n",
    "sns.set(style='whitegrid')\n",
    "\n",
    "\n",
    "def stream_jsonl(path: Path) -> Iterator[Dict[str, Any]]:\n",
    "    with path.open('r') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield json.loads(line)\n",
    "\n",
    "\n",
    "def load_metrics(path: Path) -> pd.DataFrame:\n",
    "    data: List[Dict[str, Any]] = []\n",
    "    for item in stream_jsonl(path):\n",
    "        data.append({\n",
    "            'lm_loss': float(item.get('lm_loss', np.nan)),\n",
    "            'Rouge_L': float(item.get('Rouge_L', np.nan)),\n",
    "        })\n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "\n",
    "df_base = load_metrics(BASE_PATH)\n",
    "df_xl = load_metrics(XL_PATH)\n",
    "\n",
    "print('Rows (base, xl):', len(df_base), len(df_xl))\n",
    "\n",
    "def summarize(df: pd.DataFrame, name: str) -> pd.DataFrame:\n",
    "    summary = pd.DataFrame({\n",
    "        'mean': df[['lm_loss', 'Rouge_L']].mean(),\n",
    "        'variance': df[['lm_loss', 'Rouge_L']].var(ddof=1),\n",
    "        'median': df[['lm_loss', 'Rouge_L']].median(),\n",
    "    })\n",
    "    summary.index.name = f'{name} metric'\n",
    "    return summary\n",
    "\n",
    "summary_base = summarize(df_base, 'gpt2-base')\n",
    "summary_xl = summarize(df_xl, 'gpt2-xl')\n",
    "\n",
    "print('\\nSummary (gpt2-base):')\n",
    "print(summary_base)\n",
    "print('\\nSummary (gpt2-xl):')\n",
    "print(summary_xl)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-15T12:06:47.257669Z",
     "iopub.status.busy": "2025-08-15T12:06:47.257453Z",
     "iopub.status.idle": "2025-08-15T12:06:47.440890Z",
     "shell.execute_reply": "2025-08-15T12:06:47.439971Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows (base, xl): 11435 11435\n",
      "\n",
      "Summary (gpt2-base):\n",
      "                      mean  variance    median\n",
      "gpt2-base metric                              \n",
      "lm_loss           3.806960  7.322405  3.256193\n",
      "Rouge_L           0.115904  0.009221  0.102273\n",
      "\n",
      "Summary (gpt2-xl):\n",
      "                    mean  variance    median\n",
      "gpt2-xl metric                              \n",
      "lm_loss         2.990627  4.692519  2.605309\n",
      "Rouge_L         0.133526  0.011189  0.117647\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "from typing import Iterator, Dict, Any, List\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "BASE_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-base/answers_with_metrics_gpt2-base.recalc.jsonl')\n",
    "XL_PATH = Path('/hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl.recalc.jsonl')\n",
    "\n",
    "sns.set(style='whitegrid')\n",
    "\n",
    "\n",
    "def stream_jsonl(path: Path) -> Iterator[Dict[str, Any]]:\n",
    "    with path.open('r') as f:\n",
    "        for line in f:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield json.loads(line)\n",
    "\n",
    "\n",
    "def load_metrics(path: Path) -> pd.DataFrame:\n",
    "    data: List[Dict[str, Any]] = []\n",
    "    for item in stream_jsonl(path):\n",
    "        data.append({\n",
    "            'lm_loss': float(item.get('lm_loss', np.nan)),\n",
    "            'Rouge_L': float(item.get('Rouge_L', np.nan)),\n",
    "        })\n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "\n",
    "df_base = load_metrics(BASE_PATH)\n",
    "df_xl = load_metrics(XL_PATH)\n",
    "\n",
    "print('Rows (base, xl):', len(df_base), len(df_xl))\n",
    "\n",
    "def summarize(df: pd.DataFrame, name: str) -> pd.DataFrame:\n",
    "    summary = pd.DataFrame({\n",
    "        'mean': df[['lm_loss', 'Rouge_L']].mean(),\n",
    "        'variance': df[['lm_loss', 'Rouge_L']].var(ddof=1),\n",
    "        'median': df[['lm_loss', 'Rouge_L']].median(),\n",
    "    })\n",
    "    summary.index.name = f'{name} metric'\n",
    "    return summary\n",
    "\n",
    "summary_base = summarize(df_base, 'gpt2-base')\n",
    "summary_xl = summarize(df_xl, 'gpt2-xl')\n",
    "\n",
    "print('\\nSummary (gpt2-base):')\n",
    "print(summary_base)\n",
    "print('\\nSummary (gpt2-xl):')\n",
    "print(summary_xl)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-08-15T12:06:47.443700Z",
     "iopub.status.busy": "2025-08-15T12:06:47.443435Z",
     "iopub.status.idle": "2025-08-15T12:07:21.442727Z",
     "shell.execute_reply": "2025-08-15T12:07:21.441528Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wrote 11435 lines to /hy-tmp/dc/processed_data/dolly/full/gpt2/gpt2-xl/answers_with_metrics_gpt2-xl_1.jsonl\n"
     ]
    }
   ],
   "source": [
    "from itertools import zip_longest\n",
    "\n",
    "\n",
    "def recalc_and_write(\n",
    "    gen_path: Path = GENERATED_PATH,\n",
    "    truth_path: Path = TRUTH_PATH,\n",
    "    out_path: Path = OUTPUT_PATH,\n",
    ") -> None:\n",
    "    gen_iter = stream_jsonl(gen_path)\n",
    "    truth_iter = stream_jsonl(truth_path)\n",
    "\n",
    "    count = 0\n",
    "    with out_path.open('w') as out_f:\n",
    "        for gen_item, truth_item in zip_longest(gen_iter, truth_iter):\n",
    "            if gen_item is None or truth_item is None:\n",
    "                raise ValueError('Files have different number of lines; cannot align by order.')\n",
    "\n",
    "            pred = gen_item.get('LLM_output', '')\n",
    "            tgt = truth_item.get('output', '')\n",
    "            score = rouge_l(pred, tgt)\n",
    "\n",
    "            # Copy and update\n",
    "            updated = dict(gen_item)\n",
    "            updated['Rouge_L'] = score\n",
    "\n",
    "            out_f.write(json.dumps(updated, ensure_ascii=False) + '\\n')\n",
    "            count += 1\n",
    "\n",
    "    print(f'Wrote {count} lines to {out_path}')\n",
    "\n",
    "\n",
    "recalc_and_write()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
