{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "from tqdm import tqdm "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 3914 problem descriptions.\n"
     ]
    }
   ],
   "source": [
    "# Load the problem descriptions from the benchmark JSON file.\n",
    "problem_description_path = \"/home/tongye/code_generation/tong-code-optimization/dataset_benchmark/problem_all.json\"\n",
    "\n",
    "# Pin the encoding: without it open() falls back to the locale's default, so the same file\n",
    "# could decode differently (or fail) on another machine. JSON files are conventionally UTF-8.\n",
    "with open(problem_description_path, 'r', encoding=\"utf-8\") as f:\n",
    "    problem_descriptions = json.load(f)\n",
    "\n",
    "# Sanity check on how many entries were loaded.\n",
    "print(f\"There are {len(problem_descriptions)} problem descriptions.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "There are 1422 data points.\n"
     ]
    }
   ],
   "source": [
    "# Load the data points that will be annotated with problem descriptions.\n",
    "# NOTE(review): the variable is called train_dataset but the file name starts with \"test_\" --\n",
    "# confirm which split is intended.\n",
    "train_dataset_path = \"/data3/tydata3/code_optimization/cpp/dataset/by_user/test_out_pair_in_original.json\"\n",
    "\n",
    "# Pin the encoding: without it open() falls back to the locale's default, so the same file\n",
    "# could decode differently (or fail) on another machine. JSON files are conventionally UTF-8.\n",
    "with open(train_dataset_path, 'r', encoding=\"utf-8\") as g:\n",
    "    train_dataset = json.load(g)\n",
    "\n",
    "# Sanity check on how many entries were loaded.\n",
    "print(f\"There are {len(train_dataset)} data points.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3914/3914 [00:00<00:00, 514559.49it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build a lookup table from problem_id to the statement fields of that problem.\n",
    "# Only these four fields are copied from each entry; any other keys are ignored.\n",
    "# If a problem_id occurs more than once, the last entry wins.\n",
    "statement_fields = (\"problem_description\", \"input_description\", \"output_description\", \"constraints\")\n",
    "problems = {\n",
    "    entry[\"problem_id\"]: {field: entry[field] for field in statement_fields}\n",
    "    for entry in tqdm(problem_descriptions)\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1422/1422 [00:00<00:00, 92262.36it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "average length = 145.37130801687763\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Attach a combined problem statement to every data point, report its average length,\n",
    "# and write the annotated data points to a new JSON file.\n",
    "output_path = \"/data3/tydata3/code_optimization/cpp/dataset/by_user/test_out_pair_in_original_description.json\"\n",
    "\n",
    "# Validate up front: a data point whose problem_id has no entry in `problems` would otherwise\n",
    "# raise a bare KeyError midway through the loop. Collect every such id (first-seen order,\n",
    "# de-duplicated) and fail with one complete message before any data point is modified or\n",
    "# anything is written.\n",
    "missing_problem_ids = list(dict.fromkeys(\n",
    "    point[\"problem_id\"] for point in train_dataset if point[\"problem_id\"] not in problems\n",
    "))\n",
    "if missing_problem_ids:\n",
    "    raise KeyError(f\"{len(missing_problem_ids)} problem id(s) have no description: {missing_problem_ids}\")\n",
    "\n",
    "lengths = []\n",
    "for each_train_point in tqdm(train_dataset):\n",
    "    # Look the problem up once instead of once per field.\n",
    "    problem = problems[each_train_point[\"problem_id\"]]\n",
    "    problem_description = problem[\"problem_description\"]\n",
    "    input_description = problem[\"input_description\"]\n",
    "    output_description = problem[\"output_description\"]\n",
    "    constraints = problem[\"constraints\"]\n",
    "\n",
    "    # The four sections are joined with single newlines, each introduced by a \"##\" label.\n",
    "    problem_final_description = (\n",
    "        f\"##problem description: {problem_description}\\n\"\n",
    "        f\"##input description: {input_description}\\n\"\n",
    "        f\"##output description: {output_description}\\n\"\n",
    "        f\"##constraints: {constraints}\"\n",
    "    )\n",
    "\n",
    "    # Overwrites any existing \"problem_description\" key, so re-running the cell is harmless.\n",
    "    each_train_point[\"problem_description\"] = problem_final_description\n",
    "    # Length is counted in whitespace-separated tokens, not characters.\n",
    "    lengths.append(len(problem_final_description.split()))\n",
    "\n",
    "# Guard the mean: an empty dataset would otherwise raise ZeroDivisionError.\n",
    "average_length = sum(lengths) / len(lengths) if lengths else 0.0\n",
    "print(f\"average length = {average_length}\")\n",
    "\n",
    "with open(output_path, 'w', encoding=\"utf-8\") as p:\n",
    "    json.dump(train_dataset, p, indent=4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llama",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
