{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d14ac934",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Role instruction placed at the very start of every generated prompt.\n",
    "LCB_SYSTEM_MESSAGE_GENERIC = (\n",
    "    \"You are an expert Python programmer. \"\n",
    "    \"You will be given a question (problem specification) and will generate \"\n",
    "    \"a correct Python program that matches the specification and passes all tests.\"\n",
    ")\n",
    "\n",
    "# Formatting instruction used when the problem comes with starter code.\n",
    "LCB_FORMATTING_MESSAGE_WITH_STARTER_CODE = (\n",
    "    \"You will use the following starter code to write the solution \"\n",
    "    \"to the problem and enclose your code within delimiters.\"\n",
    ")\n",
    "\n",
    "# Formatting instruction used when there is no starter code (stdin/stdout programs).\n",
    "LCB_FORMATTING_WITHOUT_STARTER_CODE = (\n",
    "    \"Read the inputs from stdin solve the problem and write the answer to stdout \"\n",
    "    \"(do not directly test on the sample inputs). \"\n",
    "    \"Enclose your code within delimiters as follows. \"\n",
    "    \"Ensure that when the python program runs, it reads the inputs, \"\n",
    "    \"runs the algorithm and writes output to STDOUT.\"\n",
    ")\n",
    "\n",
    "# modified from https://github.com/LiveCodeBench/LiveCodeBench/blob/main/lcb_runner/prompts/code_generation.py\n",
    "def fetch_live_code_bench_system_prompt(prompt: str, starter_code: str | None = None):\n",
    "    \"\"\"Build the full code-generation prompt for one problem.\n",
    "\n",
    "    The result is the generic system message, a blank line, the problem text,\n",
    "    a ``### Format:`` instruction followed by a fenced Python template, and a\n",
    "    closing request to reason inside ``<think>`` tags before answering.\n",
    "\n",
    "    Args:\n",
    "        prompt: Problem statement; it is inserted verbatim.\n",
    "        starter_code: Optional starter code. When truthy it fills the fenced\n",
    "            template; otherwise a ``# YOUR CODE HERE`` placeholder is used.\n",
    "\n",
    "    Returns:\n",
    "        The assembled prompt string.\n",
    "    \"\"\"\n",
    "    # Choose the formatting instruction and the template body together so the\n",
    "    # two branches cannot drift apart.\n",
    "    if starter_code:\n",
    "        format_note, template = LCB_FORMATTING_MESSAGE_WITH_STARTER_CODE, starter_code\n",
    "    else:\n",
    "        format_note, template = LCB_FORMATTING_WITHOUT_STARTER_CODE, \"# YOUR CODE HERE\"\n",
    "\n",
    "    sections = [\n",
    "        LCB_SYSTEM_MESSAGE_GENERIC,\n",
    "        \"\\n\\n\",\n",
    "        prompt,\n",
    "        f\"### Format: {format_note}\\n\",\n",
    "        f\"```python\\n{template}\\n```\\n\\n\",\n",
    "        \"Let's think step by step within <think> </think> tags followed by detailed steps and final code using the provided format with backticks.\\n\\n\",\n",
    "    ]\n",
    "    return \"\".join(sections)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "233c3228",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from functools import partial\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "from datasets import concatenate_datasets, load_dataset\n",
    "\n",
    "\n",
    "def filter_tests(tests, max_tests: int | None = 15):\n",
    "    \"\"\"Keep the tests with the longest inputs, up to ``max_tests`` of them.\n",
    "\n",
    "    Tests are ranked by ``len(str(test.get(\"input\")))``, longest first; tests\n",
    "    whose inputs have equal length keep their original relative order.\n",
    "\n",
    "    Args:\n",
    "        tests: List of test-case dicts.\n",
    "        max_tests: Maximum number of tests to return. Defaults to 15, the\n",
    "            previously hard-coded cap. ``None`` keeps every test (still sorted).\n",
    "\n",
    "    Returns:\n",
    "        A new list with the selected tests; ``tests`` itself is not reordered.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``max_tests`` is negative.\n",
    "    \"\"\"\n",
    "    if max_tests is not None and max_tests < 0:\n",
    "        raise ValueError(f\"max_tests must be non-negative, got {max_tests}\")\n",
    "    # sorted() is stable, also with reverse=True, so ties keep their input\n",
    "    # order without an explicit index tie-breaker.\n",
    "    ranked = sorted(tests, key=lambda test: len(str(test.get(\"input\"))), reverse=True)\n",
    "    return ranked if max_tests is None else ranked[:max_tests]\n",
    "\n",
    "def preprocess_fn(example, train=True, data_source=\"livecodebench\"):\n",
    "    \"\"\"Convert one raw example into a prompt / reward-model record.\n",
    "\n",
    "    Args:\n",
    "        example: Mapping with ``problem`` and ``tests`` keys and optional\n",
    "            ``starter_code`` and ``metadata`` keys. ``tests`` may be a JSON\n",
    "            string or an already-decoded object.\n",
    "        train: When true, the test list is trimmed with ``filter_tests``.\n",
    "        data_source: Value stored in the ``data_source`` field of the result.\n",
    "            Defaults to ``\"livecodebench\"``, the previously hard-coded value.\n",
    "\n",
    "    Returns:\n",
    "        Dict with ``prompt`` (a single user chat message), ``reward_model``\n",
    "        (JSON string holding the normalized tests as ``ground_truth``),\n",
    "        ``data_source``, ``ability`` and ``extra_info``.\n",
    "    \"\"\"\n",
    "    starter_code = example.get(\"starter_code\", \"\")\n",
    "    question = fetch_live_code_bench_system_prompt(example[\"problem\"], starter_code if starter_code else None).strip()\n",
    "    prompt = [{\"role\": \"user\", \"content\": question}]\n",
    "\n",
    "    # Tests may arrive JSON-encoded or already decoded.\n",
    "    tests = example[\"tests\"]\n",
    "    if isinstance(tests, str):\n",
    "        tests = json.loads(tests)\n",
    "\n",
    "    # `example.get(\"metadata\", {})` still yields None when the key exists with a\n",
    "    # null value, which would crash on `.get` below. Normalize to a dict.\n",
    "    metadata = example.get(\"metadata\")\n",
    "    if isinstance(metadata, str):\n",
    "        # NOTE(review): assumes a string here is JSON-encoded like `tests` -- confirm against the dataset schema.\n",
    "        metadata = json.loads(metadata) if metadata.strip() else {}\n",
    "    if not isinstance(metadata, dict):\n",
    "        metadata = {}\n",
    "\n",
    "    # Convert TACO format (parallel \"inputs\"/\"outputs\" lists) to one dict per test.\n",
    "    if isinstance(tests, dict) and \"inputs\" in tests and \"outputs\" in tests:\n",
    "        tests = [\n",
    "            {\"input\": input_val, \"output\": output_val, \"testtype\": \"stdin_stdout\"}\n",
    "            for input_val, output_val in zip(tests[\"inputs\"], tests[\"outputs\"], strict=False)\n",
    "        ]\n",
    "\n",
    "    # Ensure tests is always a list\n",
    "    if not isinstance(tests, list):\n",
    "        tests = [tests] if tests else []\n",
    "\n",
    "    # Only functional tests carry the entry-point name; all others get None.\n",
    "    func_name = metadata.get(\"func_name\")\n",
    "    for test in tests:\n",
    "        if test.get(\"testtype\") == \"functional\" and func_name is not None:\n",
    "            test[\"metadata\"] = {\"func_name\": str(func_name)}\n",
    "        else:\n",
    "            test[\"metadata\"] = {\"func_name\": None}\n",
    "\n",
    "    if train:\n",
    "        # Keep at most 15 tests per problem, following the DeepCoder recommendation.\n",
    "        tests = filter_tests(tests)\n",
    "\n",
    "    reward_model = {\"style\": \"rule\", \"ground_truth\": tests}\n",
    "\n",
    "    return {\n",
    "        \"prompt\": prompt,\n",
    "        \"reward_model\": json.dumps(reward_model),\n",
    "        \"data_source\": data_source,\n",
    "        \"ability\": \"code\",\n",
    "        \"extra_info\": {\"index\": None},\n",
    "    }\n",
    "\n",
    "def convert_to_pandas(dataset):\n",
    "    \"\"\"Return ``dataset`` in pandas form.\n",
    "\n",
    "    If ``dataset`` exposes a callable ``to_pandas`` attribute, the result of\n",
    "    calling it is returned; otherwise the object is handed to ``pd.DataFrame``.\n",
    "    \"\"\"\n",
    "    converter = getattr(dataset, \"to_pandas\", None)\n",
    "    if callable(converter):\n",
    "        return converter()\n",
    "    # Fallback for plain iterables / mappings without their own converter.\n",
    "    return pd.DataFrame(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f399643e",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def _drop_present_columns(ds):\n",
    "    \"\"\"Remove the raw input columns from ``ds``, skipping any that are absent.\n",
    "\n",
    "    Only names found in ``ds.column_names`` are passed to ``remove_columns``;\n",
    "    when none of them is present, ``ds`` itself is returned unchanged.\n",
    "    \"\"\"\n",
    "    raw_columns = (\"problem\", \"solutions\", \"tests\", \"metadata\", \"starter_code\")\n",
    "    to_remove = []\n",
    "    for column in raw_columns:\n",
    "        if column in ds.column_names:\n",
    "            to_remove.append(column)\n",
    "    if not to_remove:\n",
    "        return ds\n",
    "    return ds.remove_columns(to_remove)\n",
    "\n",
    "def prepare_deepcoder_data(train_size: int | None = None, test_size: int | None = None, num_proc: int = 16):\n",
    "    \"\"\"Load, preprocess and convert the DeepCoder train and test data.\n",
    "\n",
    "    Train data is the concatenation of the ``primeintellect``, ``taco`` and\n",
    "    ``lcbv5`` train splits; test data is the ``codeforces`` test split.\n",
    "\n",
    "    Args:\n",
    "        train_size: If truthy, keep only the first ``train_size`` train rows.\n",
    "            ``None`` (or 0) keeps every row.\n",
    "        test_size: Same as ``train_size``, for the test rows.\n",
    "        num_proc: Number of worker processes passed to ``Dataset.map``.\n",
    "            Defaults to 16, the previously hard-coded value.\n",
    "\n",
    "    Returns:\n",
    "        ``(train, test)`` as produced by ``convert_to_pandas``.\n",
    "    \"\"\"\n",
    "    repo = \"agentica-org/DeepCoder-Preview-Dataset\"\n",
    "\n",
    "    def _head(dataset, size):\n",
    "        # Keep the first `size` rows; a falsy size means no truncation.\n",
    "        if not size:\n",
    "            return dataset\n",
    "        return dataset.select(range(min(size, len(dataset))))\n",
    "\n",
    "    def _preprocess(dataset, train):\n",
    "        # Map the per-example preprocessing, then strip the raw source columns.\n",
    "        mapped = dataset.map(partial(preprocess_fn, train=train), writer_batch_size=10, num_proc=num_proc)\n",
    "        return _drop_present_columns(mapped)\n",
    "\n",
    "    train_dataset = concatenate_datasets(\n",
    "        [load_dataset(repo, name=config, split=\"train\") for config in (\"primeintellect\", \"taco\", \"lcbv5\")]\n",
    "    )\n",
    "    test_dataset = load_dataset(repo, name=\"codeforces\", split=\"test\")\n",
    "\n",
    "    train_dataset = _preprocess(_head(train_dataset, train_size), train=True)\n",
    "    test_dataset = _preprocess(_head(test_dataset, test_size), train=False)\n",
    "\n",
    "    return convert_to_pandas(train_dataset), convert_to_pandas(test_dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f322f775",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build both splits with no size caps (train_size / test_size default to None).\n",
    "# Both names hold whatever convert_to_pandas returned for the respective split.\n",
    "train_dataset, test_dataset = prepare_deepcoder_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a16a977e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# to_parquet does not create missing parent directories, so create it first.\n",
    "output_dir = Path(\"data\")\n",
    "output_dir.mkdir(parents=True, exist_ok=True)\n",
    "train_dataset.to_parquet(output_dir / \"train_deepcoder.parquet\", index=False)\n",
    "\n",
    "# A fixed seed keeps the 10-row evaluation subset identical across runs; capping n\n",
    "# avoids a ValueError when the test frame has fewer than 10 rows.\n",
    "test_subset = test_dataset.sample(n=min(10, len(test_dataset)), random_state=42)\n",
    "test_subset.to_parquet(output_dir / \"test_deepcoder_10.parquet\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71741ee4",
   "metadata": {},
   "source": [
    "## Testing dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2f785a05",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def preprocess_test_fn(example, data_source):\n",
    "    \"\"\"Preprocess one evaluation example and tag it with ``data_source``.\n",
    "\n",
    "    Delegates to ``preprocess_fn`` with ``train=False`` (so the test list is\n",
    "    not trimmed) instead of duplicating its body, then overwrites the\n",
    "    ``data_source`` field of the returned record.\n",
    "\n",
    "    Args:\n",
    "        example: Raw example mapping, in the form ``preprocess_fn`` accepts.\n",
    "        data_source: Value to store in the ``data_source`` field.\n",
    "\n",
    "    Returns:\n",
    "        Dict with ``prompt``, ``reward_model``, ``data_source``, ``ability``\n",
    "        and ``extra_info`` keys.\n",
    "    \"\"\"\n",
    "    record = preprocess_fn(example, train=False)\n",
    "    # Assigning to an existing key keeps the original key order of the record.\n",
    "    record[\"data_source\"] = data_source\n",
    "    return record"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e2c11ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw `test` splits of the codeforces and lcbv5 configs.\n",
    "codeforces_dataset = load_dataset(\"agentica-org/DeepCoder-Preview-Dataset\", name=\"codeforces\", split=\"test\")\n",
    "lcbv5_dataset = load_dataset(\"agentica-org/DeepCoder-Preview-Dataset\", name=\"lcbv5\", split=\"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "225050c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess each benchmark with its own data_source tag, drop the raw columns,\n",
    "# and convert the result with convert_to_pandas.\n",
    "# NOTE: both names are rebound from the loaded split to the converted result,\n",
    "# so re-run the loading cell before re-running this one.\n",
    "codeforces_dataset = codeforces_dataset.map(partial(preprocess_test_fn, data_source=\"codeforces\"), writer_batch_size=10, num_proc=16)\n",
    "codeforces_dataset = convert_to_pandas(_drop_present_columns(codeforces_dataset))\n",
    "lcbv5_dataset = lcbv5_dataset.map(partial(preprocess_test_fn, data_source=\"livecodebench\"), writer_batch_size=10, num_proc=16)\n",
    "lcbv5_dataset = convert_to_pandas(_drop_present_columns(lcbv5_dataset))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e72cae9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# to_parquet does not create missing parent directories, so create it first.\n",
    "datasets_dir = Path(\"../datasets\")\n",
    "datasets_dir.mkdir(parents=True, exist_ok=True)\n",
    "codeforces_dataset.to_parquet(datasets_dir / \"test_codeforces.parquet\", index=False)\n",
    "lcbv5_dataset.to_parquet(datasets_dir / \"test_lcbv5.parquet\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef51a4be",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "verl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
