{
 "cells": [
  {
   "cell_type": "code",
   "id": "ec74c3b7-2457-4b28-99ff-9f7439a12388",
   "metadata": {},
   "source": [
    "from typing import List, Tuple, Dict\n",
    "from bert_score import score as bert_score\n",
    "from collections import Counter\n",
    "import os\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\"\n",
    "def is_position_match(pred_pos: Tuple[int, int], gold_pos: Tuple[int, int]) -> bool:\n",
    "    return pred_pos == gold_pos\n",
    "\n",
    "def evaluate_missing_prediction(\n",
    "    gold: Dict[str, List],\n",
    "    pred: Dict[str, List],\n",
    "    model_type: str = \"roberta-large\",\n",
    "    lang: str = \"en\",\n",
    ") -> Dict[str, float]:\n",
    "\n",
    "    gold_positions = gold[\"positions\"]\n",
    "    gold_texts = gold[\"texts\"]\n",
    "    pred_positions = pred[\"positions\"]\n",
    "    pred_texts = pred[\"texts\"]\n",
    "\n",
    "    if len(gold_positions) == 0:\n",
    "        if len(pred_positions) == 0:\n",
    "            return {\n",
    "                \"precision_pos\": 1.0,\n",
    "                \"recall_pos\": 1.0,\n",
    "                \"f1_pos\": 1.0,\n",
    "                \"redundancy_rate\": 0.0,\n",
    "                \"text_score_position_aware\": 1.0 \n",
    "            }\n",
    "        else:\n",
    "            return {\n",
    "                \"precision_pos\": 0.0,\n",
    "                \"recall_pos\": 1.0,\n",
    "                \"f1_pos\": 0.0,\n",
    "                \"redundancy_rate\": 1.0,\n",
    "                \"text_score_position_aware\": 0.0\n",
    "            }\n",
    "\n",
    "    matched = []\n",
    "    unmatched_pred_indices = set(range(len(pred_positions)))\n",
    "    matched_texts_pred = []\n",
    "    matched_texts_gold = []\n",
    "\n",
    "    for i, gpos in enumerate(gold_positions):\n",
    "        matched_flag = False\n",
    "        for j, ppos in enumerate(pred_positions):\n",
    "            if is_position_match(ppos, gpos):\n",
    "                matched.append((i, j))\n",
    "                matched_texts_gold.append(gold_texts[i])\n",
    "                matched_texts_pred.append(pred_texts[j])\n",
    "                unmatched_pred_indices.discard(j)\n",
    "                matched_flag = True\n",
    "                break \n",
    "        if not matched_flag:\n",
    "            matched_texts_gold.append(gold_texts[i])\n",
    "            matched_texts_pred.append(\"\")\n",
    "\n",
    "    tp = len(matched)\n",
    "    fp = len(unmatched_pred_indices)\n",
    "    fn = len(gold_positions) - tp\n",
    "\n",
    "    precision_pos = tp / (tp + fp) if (tp + fp) > 0 else 0.0\n",
    "    recall_pos = tp / (tp + fn) if (tp + fn) > 0 else 0.0\n",
    "    f1_pos = (2 * precision_pos * recall_pos / (precision_pos + recall_pos)) if (precision_pos + recall_pos) > 0 else 0.0\n",
    "    redundancy_rate = fp / (tp + fp) if (tp + fp) > 0 else 0.0\n",
    "\n",
    "    P, R, F1 = bert_score(matched_texts_pred, matched_texts_gold, lang=lang, model_type=model_type, verbose=False)\n",
    "    text_score = float(F1.mean().item()) \n",
    "\n",
    "    return {\n",
    "        \"precision_pos\": round(precision_pos, 4),\n",
    "        \"recall_pos\": round(recall_pos, 4),\n",
    "        \"f1_pos\": round(f1_pos, 4),\n",
    "        \"redundancy_rate\": round(redundancy_rate, 4),\n",
    "        \"text_score_position_aware\": round(text_score, 4),\n",
    "    }\n"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "bc9b7696-577b-4fba-9dac-88f94690e27a",
   "metadata": {},
   "source": [
    "from bert_score import score as bert_score\n",
    "from tqdm import tqdm\n",
    "\n",
    "def batch_evaluate(\n",
    "    gold_list: List[Dict[str, List]],\n",
    "    pred_list: List[Dict[str, List]],\n",
    "    model_type: str = \"roberta-large\",\n",
    "    lang: str = \"en\",\n",
    "    show_progress: bool = True,\n",
    ") -> Dict[str, float]:\n",
    "    assert len(gold_list) == len(pred_list), \"gold/pred inconsistent\"\n",
    "\n",
    "    metrics_sum = {\n",
    "        \"precision_pos\": 0.0,\n",
    "        \"recall_pos\": 0.0,\n",
    "        \"f1_pos\": 0.0,\n",
    "        \"redundancy_rate\": 0.0,\n",
    "    }\n",
    "\n",
    "    all_gold_texts = []\n",
    "    all_pred_texts = []\n",
    "    per_sample_matched_counts = []\n",
    "    n = len(gold_list)\n",
    "\n",
    "    iterable = zip(gold_list, pred_list)\n",
    "    if show_progress:\n",
    "        iterable = tqdm(iterable, total=n, desc=\"Evaluating\")\n",
    "\n",
    "    for gold, pred in iterable:\n",
    "        gold_pos, pred_pos = gold[\"positions\"], pred[\"positions\"]\n",
    "        gold_texts, pred_texts = gold[\"texts\"], pred[\"texts\"]\n",
    "\n",
    "        matched = []\n",
    "        for i, g in enumerate(gold_pos):\n",
    "            for j, p in enumerate(pred_pos):\n",
    "                if g == p:\n",
    "                    matched.append((i, j))\n",
    "                    break \n",
    "\n",
    "        matched_gold_texts = [gold_texts[i] for i, _ in matched]\n",
    "        matched_pred_texts = [pred_texts[j] for _, j in matched]\n",
    "\n",
    "        all_gold_texts.extend(matched_gold_texts)\n",
    "        all_pred_texts.extend(matched_pred_texts)\n",
    "        per_sample_matched_counts.append(len(matched))\n",
    "\n",
    "        true_positives = len(matched)\n",
    "        precision_pos = true_positives / len(pred_pos) if pred_pos else 0.0\n",
    "        recall_pos = true_positives / len(gold_pos) if gold_pos else 0.0\n",
    "        f1_pos = (\n",
    "            2 * precision_pos * recall_pos / (precision_pos + recall_pos)\n",
    "            if precision_pos + recall_pos > 0 else 0.0\n",
    "        )\n",
    "        redundancy_rate = (len(pred_pos) - true_positives) / len(pred_pos) if pred_pos else 0.0\n",
    "\n",
    "        metrics_sum[\"precision_pos\"] += precision_pos\n",
    "        metrics_sum[\"recall_pos\"] += recall_pos\n",
    "        metrics_sum[\"f1_pos\"] += f1_pos\n",
    "        metrics_sum[\"redundancy_rate\"] += redundancy_rate\n",
    "\n",
    "    if all_gold_texts:\n",
    "        _, _, F1 = bert_score(\n",
    "            all_pred_texts, all_gold_texts,\n",
    "            lang=lang, model_type=model_type, verbose=False, device=\"cuda\"\n",
    "        )\n",
    "\n",
    "        f1_list = F1.tolist()\n",
    "        idx = 0\n",
    "        sample_text_scores = []\n",
    "        for count in per_sample_matched_counts:\n",
    "            if count == 0:\n",
    "                sample_text_scores.append(None)\n",
    "            else:\n",
    "                sample_f1 = sum(f1_list[idx:idx + count]) / len(gold_list[sample_text_scores.__len__()][\"texts\"])  # 除以gold中应有的总数\n",
    "                sample_text_scores.append(sample_f1)\n",
    "                idx += count\n",
    "        avg_text_score = round(sum(f for f in sample_text_scores if f is not None) / n, 4)\n",
    "    else:\n",
    "        avg_text_score = None\n",
    "\n",
    "    avg_metrics = {k: round(v / n, 4) for k, v in metrics_sum.items()}\n",
    "    avg_metrics[\"text_score_position_aware\"] = avg_text_score\n",
    "\n",
    "    return avg_metrics\n"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "fd38d204-086e-4bcd-9c68-59e5f19d4f8e",
   "metadata": {},
   "source": [
    "def extract_missing_steps(text, eval_model):\n",
    "    if eval_model==\"ours\":\n",
    "        pattern = re.compile(\n",
    "            r'Missing Step (\\d+)：\\s*'\n",
    "            r'The missing step should be placed between Step (\\d+) and Step (\\d+)\\.\\s*'\n",
    "            r'The missing step is:\\s*(.*?)'\n",
    "            r'(?=\\s*Missing Step \\d+：|\\Z)',\n",
    "            re.DOTALL\n",
    "        )\n",
    "    if eval_model==\"72b\" or eval_model==\"7b\":\n",
    "        pattern = re.compile(\n",
    "            r'Missing Step (\\d+):\\s*'\n",
    "            r'The missing step should be placed between Step (\\d+) and Step (\\d+)\\.\\s*'\n",
    "            r'The missing step is:\\s*(.*?)'\n",
    "            r'(?=\\s*Missing Step \\d+:|\\Z)',\n",
    "            re.DOTALL\n",
    "        )\n",
    "    matches = pattern.findall(text)\n",
    "    results = []\n",
    "    for match in matches:\n",
    "        a, x, y, z = match\n",
    "        results.append({\n",
    "            'a': a.strip(),\n",
    "            'x': x.strip(),\n",
    "            'y': y.strip(),\n",
    "            'z': z.strip()\n",
    "        })\n",
    "    return results\n",
    "\n",
    "def filter_results(results):\n",
    "    filtered_results = []\n",
    "    for result in results:\n",
    "        x = int(result['x'])\n",
    "        y = int(result['y'])\n",
    "        z = result['z']\n",
    "        if x + 1 == y and not z.startswith('####'):\n",
    "            filtered_results.append(result)\n",
    "    return filtered_results\n",
    "\n",
    "def sort_results_by_x(results):\n",
    "    sorted_results = sorted(results, key=lambda result: int(result['x']))\n",
    "    return sorted_results\n",
    "\n",
    "def process_text(text, eval_model):\n",
    "    results = extract_missing_steps(text, eval_model=eval_model)\n",
    "    filtered_results = filter_results(results)\n",
    "    sorted_results = sort_results_by_x(filtered_results)\n",
    "    return sorted_results\n",
    "\n"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "654b42ab-1efb-4555-b1ec-959a03e1002f",
   "metadata": {},
   "source": [
    "import re\n",
    "import pickle\n",
    "with open('results-72b.pkl', 'rb') as f:\n",
    "    loaded_results = pickle.load(f)\n",
    "with open('gold.pkl', 'rb') as f:\n",
    "    gold = pickle.load(f)\n",
    "predicts = []\n",
    "for i in range(len(loaded_results)):\n",
    "    data = process_text(loaded_results[i], eval_model=\"72b\")\n",
    "    \n",
    "    predict = {\n",
    "        \"positions\": [(int(item[\"x\"]), int(item[\"y\"])) for item in data],\n",
    "        \"texts\": [item[\"z\"] for item in data]\n",
    "    }\n",
    "    \n",
    "    predicts.append(predict)\n",
    "results = batch_evaluate(gold, predicts)\n",
    "print(results)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "99a6bdb6-587e-45c4-8852-2da32ae769dd",
   "metadata": {},
   "source": [
    "import re\n",
    "import pickle\n",
    "with open('results-7b.pkl', 'rb') as f:\n",
    "    loaded_results = pickle.load(f)\n",
    "with open('gold.pkl', 'rb') as f:\n",
    "    gold = pickle.load(f)\n",
    "predicts = []\n",
    "for i in range(len(loaded_results)):\n",
    "    data = process_text(loaded_results[i], eval_model=\"7b\")\n",
    "    \n",
    "    predict = {\n",
    "        \"positions\": [(int(item[\"x\"]), int(item[\"y\"])) for item in data],\n",
    "        \"texts\": [item[\"z\"] for item in data]\n",
    "    }\n",
    "    \n",
    "    predicts.append(predict)\n",
    "results = batch_evaluate(gold, predicts)\n",
    "print(results)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "307ee3c0-eaba-4ee9-8d54-22767d8f9dbf",
   "metadata": {},
   "source": [
    "import re\n",
    "import pickle\n",
    "with open('results-ours.pkl', 'rb') as f:\n",
    "    loaded_results = pickle.load(f)\n",
    "with open('gold.pkl', 'rb') as f:\n",
    "    gold = pickle.load(f)\n",
    "predicts = []\n",
    "for i in range(len(loaded_results)):\n",
    "    data = process_text(loaded_results[i], eval_model=\"ours\")\n",
    "    \n",
    "    predict = {\n",
    "        \"positions\": [(int(item[\"x\"]), int(item[\"y\"])) for item in data],\n",
    "        \"texts\": [item[\"z\"] for item in data]\n",
    "    }\n",
    "    \n",
    "    predicts.append(predict)\n",
    "results = batch_evaluate(gold, predicts)\n",
    "print(results)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "13710b2f-138f-4621-bf30-4cca584ce46e",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
