{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# V2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 28271/28271 [00:03<00:00, 7482.23 examples/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 29/29 [00:00<00:00, 228.14ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "21998956"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "import os\n",
    "import datasets\n",
    "\n",
    "from verl.utils.hdfs_io import copy, makedirs\n",
    "import argparse\n",
    "\n",
    "# Instruction for the user turn. The single `{}` slot is filled through str.format\n",
    "# in process_fn with the Lean 4 source of one problem.\n",
    "base_prompt = '\\n'.join([\n",
    "    'Complete the following Lean 4 code:',\n",
    "    '',\n",
    "    '```lean4',\n",
    "    '{}',\n",
    "    '```',\n",
    "    '',\n",
    "    'Before producing the Lean 4 code to formally prove the given theorem, provide a detailed proof plan outlining the main proof steps and strategies.',\n",
    "    'The plan should highlight key ideas, intermediate lemmas, and proof structures that will guide the construction of the final formal proof.',\n",
    "])\n",
    "\n",
    "\n",
    "# Lean 4 preamble placed in front of every problem statement. The empty first and\n",
    "# last entries give the joined text its leading and trailing newline.\n",
    "policy_prompt_template_deepseek = '\\n'.join([\n",
    "    '',\n",
    "    'import Mathlib',\n",
    "    'import Aesop',\n",
    "    '',\n",
    "    'set_option maxHeartbeats 0',\n",
    "    '',\n",
    "    'open BigOperators Real Nat Topology Rat',\n",
    "    '',\n",
    "])\n",
    "\n",
    "\n",
    "def process_fn(example, idx):\n",
    "    \"\"\"Turn one raw dataset row into a single-turn prompt record.\n",
    "\n",
    "    The Lean source is `policy_prompt_template_deepseek`, then the informal\n",
    "    statement, then the theorem statement, concatenated with no separator added;\n",
    "    that text is substituted into `base_prompt`.\n",
    "\n",
    "    Args:\n",
    "        example: row mapping; 'theorem_statement' and 'informal_statement' are\n",
    "            popped from it (theorem first).\n",
    "        idx: row index, stored as extra_info['index'].\n",
    "\n",
    "    Returns:\n",
    "        dict with the keys data_source, prompt, ability, reward_model, extra_info.\n",
    "    \"\"\"\n",
    "    formal_part = example.pop('theorem_statement')\n",
    "    informal_part = example.pop('informal_statement')\n",
    "    lean_source = policy_prompt_template_deepseek + informal_part\n",
    "    lean_source = lean_source + formal_part\n",
    "\n",
    "    user_turn = dict(role='user', content=base_prompt.format(lean_source))\n",
    "    # ground_truth is left as an empty string for every row.\n",
    "    reward_spec = dict(style='rule', ground_truth='')\n",
    "    row_meta = dict(split='train', index=idx)\n",
    "\n",
    "    return dict(\n",
    "        data_source='mff-lwb-goedel-28k',\n",
    "        prompt=[user_turn],\n",
    "        ability='math',\n",
    "        reward_model=reward_spec,\n",
    "        extra_info=row_meta,\n",
    "    )\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "import datasets\n",
    "\n",
    "# Source parquet location: directory (with trailing slash) and file name.\n",
    "name1 = '/beegfs/scratch/user/<anonymized>/fcdpg-verl/verl/data/'\n",
    "name2 = 'mff-lwb-goedel-28k.parquet'\n",
    "ds = datasets.load_dataset('parquet', data_files={'train': os.path.join(name1, name2)})\n",
    "# with_indices=True makes map() pass the row index as process_fn's second argument.\n",
    "train_dataset = ds.map(function=process_fn, with_indices=True)\n",
    "\n",
    "# Create the output directory up front instead of relying on to_parquet to do it.\n",
    "processed_dir = os.path.join(name1, 'processed')\n",
    "os.makedirs(processed_dir, exist_ok=True)\n",
    "# Kept as the last expression so the value returned by to_parquet is shown as the cell output.\n",
    "train_dataset['train'].to_parquet(os.path.join(processed_dir, name2))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# V1.5 CoT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 244/244 [00:00<00:00, 5620.82 examples/s]\n",
      "Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 504.43ba/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "170376"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import datasets\n",
    "\n",
    "\n",
    "# Instruction for the user turn; the single `{}` slot is filled through str.format in\n",
    "# process_fn. The text stops right after the slot, so the ```lean4 fence opened here\n",
    "# is not closed by the prompt itself.\n",
    "base_prompt = '\\n'.join([\n",
    "    'Complete the following Lean 4 code with explanatory comments preceding each line of code:',\n",
    "    '',\n",
    "    '```lean4',\n",
    "    '{}',\n",
    "])\n",
    "\n",
    "\n",
    "# Lean 4 preamble placed in front of every problem statement. The empty first and\n",
    "# last entries give the joined text its leading and trailing newline.\n",
    "policy_prompt_template_deepseek = '\\n'.join([\n",
    "    '',\n",
    "    'import Mathlib',\n",
    "    'import Aesop',\n",
    "    '',\n",
    "    'set_option maxHeartbeats 0',\n",
    "    '',\n",
    "    'open BigOperators Real Nat Topology Rat',\n",
    "    '',\n",
    "])\n",
    "\n",
    "\n",
    "def process_fn(example, idx):\n",
    "    \"\"\"Turn one raw dataset row into a single-turn prompt record.\n",
    "\n",
    "    The Lean source is `policy_prompt_template_deepseek`, the informal statement\n",
    "    and the theorem statement, with one extra newline inserted before each of the\n",
    "    two statements; that text is substituted into `base_prompt`.\n",
    "\n",
    "    Args:\n",
    "        example: row mapping; 'theorem_statement' and 'informal_statement' are\n",
    "            popped from it (theorem first).\n",
    "        idx: row index, stored as extra_info['index'].\n",
    "\n",
    "    Returns:\n",
    "        dict with the keys data_source, prompt, ability, reward_model, extra_info.\n",
    "    \"\"\"\n",
    "    formal_part = example.pop('theorem_statement')\n",
    "    informal_part = example.pop('informal_statement')\n",
    "    lean_source = policy_prompt_template_deepseek + '\\n' + informal_part\n",
    "    lean_source = lean_source + '\\n' + formal_part\n",
    "\n",
    "    user_turn = dict(role='user', content=base_prompt.format(lean_source))\n",
    "    # ground_truth is left as an empty string for every row.\n",
    "    reward_spec = dict(style='rule', ground_truth='')\n",
    "    row_meta = dict(split='test', index=idx)\n",
    "\n",
    "    return dict(\n",
    "        data_source='minif2f_test',\n",
    "        prompt=[user_turn],\n",
    "        ability='math',\n",
    "        reward_model=reward_spec,\n",
    "        extra_info=row_meta,\n",
    "    )\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "import os\n",
    "\n",
    "import datasets\n",
    "\n",
    "# Source parquet location: directory (with trailing slash) and file name.\n",
    "name1 = '/beegfs/scratch/user/<anonymized>/fcdpg-verl/verl/data/'\n",
    "name2 = 'minif2f_test.parquet'\n",
    "# The file is registered under the split key 'train', which is why the result is\n",
    "# indexed with ['train'] below even though process_fn tags each row as split 'test'.\n",
    "ds = datasets.load_dataset('parquet', data_files={'train': os.path.join(name1, name2)})\n",
    "# with_indices=True makes map() pass the row index as process_fn's second argument.\n",
    "train_dataset = ds.map(function=process_fn, with_indices=True)\n",
    "\n",
    "# Create the output directory up front instead of relying on to_parquet to do it.\n",
    "processed_dir = os.path.join(name1, 'processed_cot')\n",
    "os.makedirs(processed_dir, exist_ok=True)\n",
    "# Kept as the last expression so the value returned by to_parquet is shown as the cell output.\n",
    "train_dataset['train'].to_parquet(os.path.join(processed_dir, name2))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "\n",
    "\n",
    "# NOTE(review): this cell repeats the preceding executed cell's definitions and\n",
    "# script; consider keeping only one copy.\n",
    "#\n",
    "# Instruction for the user turn; the single `{}` slot is filled through str.format in\n",
    "# process_fn. The text stops right after the slot, so the ```lean4 fence opened here\n",
    "# is not closed by the prompt itself.\n",
    "base_prompt = '\\n'.join([\n",
    "    'Complete the following Lean 4 code with explanatory comments preceding each line of code:',\n",
    "    '',\n",
    "    '```lean4',\n",
    "    '{}',\n",
    "])\n",
    "\n",
    "\n",
    "# Lean 4 preamble placed in front of every problem statement. The empty first and\n",
    "# last entries give the joined text its leading and trailing newline.\n",
    "policy_prompt_template_deepseek = '\\n'.join([\n",
    "    '',\n",
    "    'import Mathlib',\n",
    "    'import Aesop',\n",
    "    '',\n",
    "    'set_option maxHeartbeats 0',\n",
    "    '',\n",
    "    'open BigOperators Real Nat Topology Rat',\n",
    "    '',\n",
    "])\n",
    "\n",
    "\n",
    "def process_fn(example, idx):\n",
    "    \"\"\"Turn one raw dataset row into a single-turn prompt record.\n",
    "\n",
    "    The Lean source is `policy_prompt_template_deepseek`, the informal statement\n",
    "    and the theorem statement, with one extra newline inserted before each of the\n",
    "    two statements; that text is substituted into `base_prompt`.\n",
    "\n",
    "    Args:\n",
    "        example: row mapping; 'theorem_statement' and 'informal_statement' are\n",
    "            popped from it (theorem first).\n",
    "        idx: row index, stored as extra_info['index'].\n",
    "\n",
    "    Returns:\n",
    "        dict with the keys data_source, prompt, ability, reward_model, extra_info.\n",
    "    \"\"\"\n",
    "    formal_part = example.pop('theorem_statement')\n",
    "    informal_part = example.pop('informal_statement')\n",
    "    lean_source = policy_prompt_template_deepseek + '\\n' + informal_part\n",
    "    lean_source = lean_source + '\\n' + formal_part\n",
    "\n",
    "    user_turn = dict(role='user', content=base_prompt.format(lean_source))\n",
    "    # ground_truth is left as an empty string for every row.\n",
    "    reward_spec = dict(style='rule', ground_truth='')\n",
    "    row_meta = dict(split='test', index=idx)\n",
    "\n",
    "    return dict(\n",
    "        data_source='minif2f_test',\n",
    "        prompt=[user_turn],\n",
    "        ability='math',\n",
    "        reward_model=reward_spec,\n",
    "        extra_info=row_meta,\n",
    "    )\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "import os\n",
    "\n",
    "import datasets\n",
    "\n",
    "# Source parquet location: directory (with trailing slash) and file name.\n",
    "name1 = '/beegfs/scratch/user/<anonymized>/fcdpg-verl/verl/data/'\n",
    "name2 = 'minif2f_test.parquet'\n",
    "# The file is registered under the split key 'train', which is why the result is\n",
    "# indexed with ['train'] below even though process_fn tags each row as split 'test'.\n",
    "ds = datasets.load_dataset('parquet', data_files={'train': os.path.join(name1, name2)})\n",
    "# with_indices=True makes map() pass the row index as process_fn's second argument.\n",
    "train_dataset = ds.map(function=process_fn, with_indices=True)\n",
    "\n",
    "# Create the output directory up front instead of relying on to_parquet to do it.\n",
    "processed_dir = os.path.join(name1, 'processed_cot')\n",
    "os.makedirs(processed_dir, exist_ok=True)\n",
    "# Kept as the last expression so the value returned by to_parquet is shown as the cell output.\n",
    "train_dataset['train'].to_parquet(os.path.join(processed_dir, name2))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'content': 'Complete the following Lean 4 code with explanatory comments preceding each line of code:\\n\\n```lean4\\n\\nimport Mathlib\\nimport Aesop\\n\\nset_option maxHeartbeats 0\\n\\nopen BigOperators Real Nat Topology Rat\\n\\n/-- The volume of a cone is given by the formula $V = \\\\frac{1}{3}Bh$, where $B$ is the area of the base and $h$ is the height. The area of the base of a cone is 30 square units, and its height is 6.5 units. What is the number of cubic units in its volume? Show that it is 65.-/\\n\\ntheorem mathd_algebra_478 (b h v : ℝ) (h₀ : 0 < b ∧ 0 < h ∧ 0 < v) (h₁ : v = 1 / 3 * (b * h))\\n    (h₂ : b = 30) (h₃ : h = 13 / 2) : v = 65 := by\\n', 'role': 'user'}]\n"
     ]
    }
   ],
   "source": [
    "# Spot-check: print the prompt (a list of role/content messages) of the first processed row.\n",
    "first_row = train_dataset['train'][0]\n",
    "print(first_row['prompt'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "verl2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
