{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DatasetDict({\n",
      "    train: Dataset({\n",
      "        features: ['problem', 'level', 'type', 'solution'],\n",
      "        num_rows: 7500\n",
      "    })\n",
      "    test: Dataset({\n",
      "        features: ['problem', 'level', 'type', 'solution'],\n",
      "        num_rows: 5000\n",
      "    })\n",
      "})\n",
      "Error extracting difficulty from Level ?\n",
      "Error extracting difficulty from Level ?\n"
     ]
    },
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'math.json'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[16], line 30\u001b[0m\n\u001b[1;32m     22\u001b[0m     new_entry \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m     23\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproblem\u001b[39m\u001b[38;5;124m\"\u001b[39m: entry[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mproblem\u001b[39m\u001b[38;5;124m'\u001b[39m],\n\u001b[1;32m     24\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124manswer\u001b[39m\u001b[38;5;124m\"\u001b[39m: answer,\n\u001b[1;32m     25\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdifficulty\u001b[39m\u001b[38;5;124m\"\u001b[39m: extract_difficulty(entry[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlevel\u001b[39m\u001b[38;5;124m\"\u001b[39m]),\n\u001b[1;32m     26\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m: entry[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtype\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m     27\u001b[0m     }\n\u001b[1;32m     28\u001b[0m     dataset\u001b[38;5;241m.\u001b[39mappend(new_entry)\n\u001b[0;32m---> 30\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmath.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mw+\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m     31\u001b[0m     json\u001b[38;5;241m.\u001b[39mdump(dataset, f, indent\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m4\u001b[39m)\n",
      "File \u001b[0;32m/opt/conda/envs/tsj/lib/python3.11/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m    317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m    318\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    319\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    320\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    321\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    322\u001b[0m     )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'math.json'"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "from rllm.rewards.math_utils import extract_answer\n",
    "\n",
    "ds = load_dataset(\"hendrycks/competition_math\")\n",
    "\n",
    "print(ds)\n",
    "\n",
    "\n",
    "def extract_difficulty(level_str):\n",
    "    # Extract the number from the string and convert it to a float\n",
    "    try:\n",
    "        difficulty = float(level_str.split()[-1])\n",
    "    except (ValueError, IndexError):\n",
    "        print(f\"Error extracting difficulty from {level_str}\")\n",
    "        difficulty = 0.0\n",
    "    return difficulty\n",
    "\n",
    "\n",
    "dataset = []\n",
    "for entry in ds[\"train\"]:\n",
    "    answer = extract_answer(entry[\"solution\"])\n",
    "    new_entry = {\"problem\": entry[\"problem\"], \"answer\": answer, \"difficulty\": extract_difficulty(entry[\"level\"]), \"type\": entry[\"type\"]}\n",
    "    dataset.append(new_entry)\n",
    "\n",
    "with open(\"math.json\", \"w\") as f:\n",
    "    json.dump(dataset, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset({\n",
      "    features: ['problem', 'solution', 'answer', 'subject', 'level', 'unique_id'],\n",
      "    num_rows: 500\n",
      "})\n"
     ]
    }
   ],
   "source": [
    "test_dataset = load_dataset(\"HuggingFaceH4/MATH-500\", trust_remote_code=True)[\"test\"]\n",
    "print(test_dataset)\n",
    "dataset = []\n",
    "for entry in test_dataset:\n",
    "    answer = extract_answer(entry[\"solution\"])\n",
    "    new_entry = {\"problem\": entry[\"problem\"], \"answer\": answer, \"difficulty\": float(entry[\"level\"]), \"unique_id\": entry[\"unique_id\"]}\n",
    "    dataset.append(new_entry)\n",
    "\n",
    "with open(\"math500.json\", \"w\") as f:\n",
    "    json.dump(dataset, f, indent=4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tsj",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
