{
 "cells": [
  {
   "cell_type": "code",
   "id": "0bec98cb-d583-4b48-8c0c-6361b6d336ab",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "from collections import Counter\n",
    "import random\n",
    "df = pd.read_json(\"ScaleQuest-Math.json\", lines=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "18404493-f247-4752-a319-3ccfbdd19e28",
   "metadata": {},
   "source": [
    "import random\n",
    "def generate_non_consecutive_numbers(X, n):\n",
    "    max_possible = (X + 1) // 2 + (X % 2)\n",
    "    if n > max_possible:\n",
    "        raise ValueError(f\"Cannot generate {n} non-consecutive numbers in range [0, {X}]\")\n",
    "    while True:\n",
    "        candidates = list(range(X + 1))\n",
    "        random.shuffle(candidates)\n",
    "        result = []\n",
    "        banned = set()\n",
    "        for num in candidates:\n",
    "            if num in banned:\n",
    "                continue\n",
    "            result.append(num)\n",
    "            banned.update({num - 1, num + 1})\n",
    "            if len(result) == n:\n",
    "                return sorted(result)\n",
    "def data_process(idx):\n",
    "    data = df.iloc[idx]['response'].split(\"\\n\\n\")\n",
    "    # 20% num_to_remove = 0\n",
    "    if random.random() < 0.2:\n",
    "        num_to_remove = 0\n",
    "    else:\n",
    "        if len(data) <= 10:\n",
    "            num_to_remove = random.randint(1, 2)  # remove 1-2 steps\n",
    "        else:\n",
    "            num_to_remove = random.randint(1, 3)  # remove 1-3 steps\n",
    "    \n",
    "    removed_steps = []\n",
    "    removed_items = []\n",
    "    if num_to_remove > 0:\n",
    "        removed_steps = generate_non_consecutive_numbers(len(data) - 3, num_to_remove)\n",
    "    remain_data = [data[i] for i in range(len(data)) if i not in removed_steps]\n",
    "    removed_items = [data[i] for i in range(len(data)) if i in removed_steps]\n",
    "    result = \"\\n\".join([f\"step{i+1}:\\n{item}\" for i, item in enumerate(remain_data)])\n",
    "    \n",
    "    sy = \"\"\"You are a mathematics teacher reviewing a solution that may be missing one or more steps.\n",
    "Your task is to:\n",
    "1. Identify all points in the logical flow where a step is missing. For each missing step, specify exactly between which two consecutive steps it should be placed.\n",
    "2. Provide the complete missing step(s) with necessary explanations and equations.\n",
    "The solution may be missing multiple steps or might be complete. The steps in the solution are labeled from Step 0 (problem statement) to Step N.\n",
    "Please format your response as follows:\n",
    "For each missing step, output:\n",
    "Missing Step X:\n",
    "The missing step should be placed between Step Y and Step Y+1.\n",
    "The missing step is:\n",
    "[Write the complete missing step here with necessary explanations and equations]\n",
    "If there are no missing steps, please output:\n",
    "No missing steps.\n",
    "\"\"\"\n",
    "    \n",
    "    prompt = f\"\"\"<incomplete_solution>\n",
    "Step 0:\n",
    "{df.iloc[idx]['query']}\n",
    "{result}\n",
    "</incomplete_solution>\n",
    "\"\"\"\n",
    "    assist = \"\"\n",
    "    if removed_steps:\n",
    "        for i, (removed_step, removed_item) in enumerate(zip(removed_steps, removed_items), start=1):\n",
    "            assist += f\"Missing Step {i}：\\nThe missing step should be placed between Step {removed_step-i+1} and Step {removed_step-i+2}.\\n\"\n",
    "            assist += f\"The missing step is:\\n{removed_item}\\n\"\n",
    "    else:\n",
    "        assist = \"No missing steps.\"\n",
    "\n",
    "    m = {\n",
    "        \"messages\": [\n",
    "          {\n",
    "            \"role\": \"system\",\n",
    "            \"content\": sy\n",
    "          },\n",
    "          {\n",
    "            \"role\": \"user\",\n",
    "            \"content\": prompt\n",
    "          },\n",
    "          {\n",
    "            \"role\": \"assistant\",\n",
    "            \"content\": assist\n",
    "          }\n",
    "        ]\n",
    "      }\n",
    "    return m"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "dfab2c9b-ffb9-46e1-99e0-cffa20686420",
   "metadata": {},
   "source": [
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "process_data = []\n",
    "\n",
    "for i in tqdm(range(len(df))):\n",
    "    data = df.iloc[i]['response'].split(\"\\n\\n\")\n",
    "    if len(data) >= 6:\n",
    "        process_data.append(data_process(i))\n",
    "\n",
    "with open('ScaleQM+_total.json', 'w') as f:\n",
    "    json.dump(process_data, f, ensure_ascii=False, indent=4)\n",
    "\n",
    "print(\"process_data has been saved to JSON files.\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d2d4efe8-e75b-4f17-98ff-fecdff9adb29",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
