{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "358d27e2",
   "metadata": {},
   "source": [
    "## Split and annotate the reasoning traces"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dfa7add7",
   "metadata": {},
   "source": [
    "This notebook implements two algorithms:\n",
    "- **Step-wise Generation:** Given reasoning traces from models, we split them into steps following the definition we explicited in our paper.\n",
    "- **Step Labelling:** Using our ReasonType taxonomy, we prompted GPT-4o-mini to attribute a step-type to each steps resulting from our *Step-wise Generation* algorithm."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0f8c7e1",
   "metadata": {},
   "source": [
    "### A. Step-wise Generation"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a9444cec",
   "metadata": {},
   "source": [
    "The function ```merge_steps_by_min_tokens``` takes a reasoning trace, the minimal number of tokens $k$, and a tokenizer. It returns a list of steps."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a7a1fcf",
   "metadata": {},
   "outputs": [],
   "source": [
    "def merge_steps_by_min_tokens(raw_answer, k, tokenizer):\n",
    "    \"\"\"\n",
    "    Merge steps so that each merged step has at least k tokens.\n",
    "    (except the last one which may be smaller)\n",
    "    \n",
    "    Inputs:\n",
    "        - raw_answer (str): The full text from the model.\n",
    "        - k (int): Minimum number of tokens required per merged step.\n",
    "        - tokenizer: A tokenizer.\n",
    "    \n",
    "    Returns:\n",
    "        list[str]: A list of merged steps.\n",
    "    \"\"\"\n",
    "\n",
    "    merged_steps = []\n",
    "    buffer = \"\"\n",
    "    buffer_tokens = 0\n",
    "\n",
    "    steps_raw = raw_answer.split('.\\n\\n')\n",
    "\n",
    "    for step in steps_raw:\n",
    "\n",
    "        step_tokens = len(tokenizer(step)['input_ids'])\n",
    "\n",
    "        buffer += (\".\\n\\n\" if buffer else \"\") + step\n",
    "        buffer_tokens += step_tokens\n",
    "\n",
    "        if buffer_tokens >= k:\n",
    "            merged_steps.append(buffer)\n",
    "            buffer = \"\"\n",
    "            buffer_tokens = 0\n",
    "\n",
    "    if buffer:\n",
    "        merged_steps.append(buffer)\n",
    "\n",
    "    return merged_steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a4b7fa6",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "model_path = \"your_tokenizer\" # Add your tokenizer\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_path)\n",
    "\n",
    "df_traces = pd.read_json('path_to_your_reasoning_traces.json') # Add reasoning traces\n",
    "\n",
    "k = 30 # Add your value of k\n",
    "\n",
    "results = []\n",
    "\n",
    "for i in tqdm(range(len(df_traces))):\n",
    "    \n",
    "    answer_full = df_traces.iloc[i]['answer']\n",
    "\n",
    "    # Split and filter steps\n",
    "    steps_raw = answer_full.split('.\\n\\n')\n",
    "    steps = merge_steps_by_min_tokens(steps_raw, k=k, tokenizer=tokenizer)\n",
    "    n = len(steps)\n",
    "\n",
    "    for j in range(n):\n",
    "\n",
    "        results.append({\n",
    "            \"sample_id\":i,\n",
    "            \"step_id\":j,\n",
    "            \"step\": steps[j]\n",
    "        })\n",
    "\n",
    "df_steps = pd.DataFrame(results)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d6d0e56c",
   "metadata": {},
   "source": [
    "### B. Step-Labelling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "76e17647",
   "metadata": {},
   "source": [
    "Once the reasoning traces are split into steps, we can prompt a model to annotate them."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c9b3f2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from openai import OpenAI\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "import time\n",
    "import pandas as pd\n",
    "\n",
    "api_key = \"you_openai_key\"\n",
    "\n",
    "client = OpenAI(api_key=api_key)\n",
    "\n",
    "def annotate_step(step):\n",
    "\n",
    "    response = client.responses.create(\n",
    "                model=\"gpt-4o-mini\",\n",
    "                input=[\n",
    "                    {\"role\": \"system\", \"content\": \"Classify the following reasoning step into one of the categories defined.\"},\n",
    "                    {\"role\": \"user\", \"content\": step}\n",
    "                ],\n",
    "                text={\n",
    "                    \"format\": {\n",
    "                        \"type\": \"json_schema\",\n",
    "                        \"name\": \"reasoning_step_annotation\",\n",
    "                        \"schema\": {\n",
    "                            \"type\": \"object\",\n",
    "                            \"properties\": {\n",
    "                                \"category\": {\n",
    "                                    \"type\": \"string\",\n",
    "                                    \"enum\": [\n",
    "                                        \"Problem Re-statement / Setup\",\n",
    "                                        \"Definition Recall\",\n",
    "                                        \"Formula Substitution / Plugging In\",\n",
    "                                        \"Quadrant/Edge Case Consideration\",\n",
    "                                        \"Symbolic Transformation / Rewriting Sums\",\n",
    "                                        \"Pattern Recognition / Symmetry\",\n",
    "                                        \"Verification / Sanity Check\",\n",
    "                                        \"Heuristics / Intuition\",\n",
    "                                        \"Alternative Approach Exploration\",\n",
    "                                        \"Numerical Approximation / Interpretation\",\n",
    "                                        \"Final Conclusion / Boxed Answer\",\n",
    "                                        \"Meta-Cognition / Self-Talk\",\n",
    "                                        \"Context Repetition / Paraphrasing\",\n",
    "                                        \"Other - please, specify\"\n",
    "                                    ]\n",
    "                                }\n",
    "                            },\n",
    "                            \"required\": [\"category\"],\n",
    "                            \"additionalProperties\": False\n",
    "                        },\n",
    "                        \"strict\": True\n",
    "                    }\n",
    "                })\n",
    "    \n",
    "    return response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f76b4b3f",
   "metadata": {},
   "outputs": [],
   "source": [
    "results = []\n",
    "\n",
    "for i in tqdm(range(len(df_steps))):\n",
    "\n",
    "    step = df_steps.iloc[i][\"step\"]\n",
    "    sample_id = df_steps.iloc[i][\"sample_id\"]\n",
    "    step_id = df_steps.iloc[i][\"step_id\"]\n",
    "    \n",
    "    start_time = time.time()\n",
    "\n",
    "    category = annotate_step(step)\n",
    "    category = json.loads(category.output_text)['category']\n",
    "\n",
    "    end_time = time.time()\n",
    "    runtime_seconds = end_time - start_time\n",
    "\n",
    "    results.append({\"sample_id\": sample_id, \"step_id\":step_id, \"step\":step, \"category\": category, \"runtime\": runtime_seconds})\n",
    "\n",
    "    df_results = pd.DataFrame(results)\n",
    "    df_results.to_json(\"labelled_traces.json\", orient=\"records\", indent=4)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
