{
 "cells": [
  {
   "cell_type": "code",
   "id": "e1b5c8a3-7b7d-4224-8d24-98570571f894",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "df = pd.read_json(\"ScaleQM+_test.json\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "60e6d311-5779-4244-acec-f210ce2579d6",
   "metadata": {},
   "source": [
    "def extract_missing_steps(text):\n",
    "    pattern = re.compile(\n",
    "        r'Missing Step (\\d+)：\\s*'\n",
    "        r'The missing step should be placed between Step (\\d+) and Step (\\d+)\\.\\s*'\n",
    "        r'The missing step is:\\s*(.*?)'\n",
    "        r'(?=\\s*Missing Step \\d+：|\\Z)',\n",
    "        re.DOTALL\n",
    "    )\n",
    "    matches = pattern.findall(text)\n",
    "    results = []\n",
    "    for match in matches:\n",
    "        a, x, y, z = match\n",
    "        results.append({\n",
    "            'a': a.strip(),\n",
    "            'x': x.strip(),\n",
    "            'y': y.strip(),\n",
    "            'z': z.strip()\n",
    "        })\n",
    "    return results\n",
    "\n",
    "def filter_results(results):\n",
    "    filtered_results = []\n",
    "    for result in results:\n",
    "        x = int(result['x'])\n",
    "        y = int(result['y']) \n",
    "        z = result['z']\n",
    "        if x + 1 == y and not z.startswith('####'):\n",
    "            filtered_results.append(result)\n",
    "    return filtered_results\n",
    "\n",
    "def sort_results_by_x(results):\n",
    "    sorted_results = sorted(results, key=lambda result: int(result['x']))\n",
    "    return sorted_results\n",
    "\n",
    "def process_text(text):\n",
    "    results = extract_missing_steps(text)\n",
    "    filtered_results = filter_results(results)\n",
    "    sorted_results = sort_results_by_x(filtered_results)\n",
    "    return sorted_results\n",
    "\n"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f03a9b07-ab6e-4546-b7de-aab71670c7da",
   "metadata": {},
   "source": [
    "import re\n",
    "results = []\n",
    "for i in range(len(df)):\n",
    "    data = process_text(df.iloc[i][\"messages\"][2][\"content\"])\n",
    "    \n",
    "    result = {\n",
    "        \"positions\": [(int(item[\"x\"]), int(item[\"y\"])) for item in data],\n",
    "        \"texts\": [item[\"z\"] for item in data]\n",
    "    }\n",
    "    \n",
    "    results.append(result)\n"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "033915cc-ceca-4aa6-9df8-78ba3ab0a6a6",
   "metadata": {},
   "source": [
    "import pickle\n",
    "with open('gold.pkl', 'wb') as f:\n",
    "    pickle.dump(results, f)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "58b04a89-fdcc-44d2-95a6-7538b16c693c",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
