{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from datasets import (\n",
    "    load_from_disk,\n",
    "    Dataset,\n",
    "    DatasetDict,\n",
    "    concatenate_datasets,\n",
    "    load_dataset,\n",
    ")\n",
    "\n",
    "# Load\n",
    "pds = concatenate_datasets(\n",
    "    [\n",
    "        load_from_disk(f\"./data/Codeforces/chunks/chunk_{chunk_index}\")\n",
    "        for chunk_index in range(16)\n",
    "    ]\n",
    ")\n",
    "pdf = pds.to_pandas()\n",
    "pdf = pdf[pdf[\"source_0\"].apply(lambda x: len(x) > 0)]\n",
    "pdf = pdf[pdf[\"source_1\"].apply(lambda x: len(x) > 0)]\n",
    "pdf = pdf[pdf[\"source_2\"].apply(lambda x: len(x) > 0)]\n",
    "assert not pdf.isna().any().any()\n",
    "\n",
    "# Tag\n",
    "pdf[\"tag\"] = pdf[\"tag\"].replace(\n",
    "    {\n",
    "        \"dp\": \"DP\",\n",
    "        \"math\": \"Math\",\n",
    "        \"greedy\": \"Greedy\",\n",
    "        \"others\": \"Others\",\n",
    "        \"implementation\": \"Implement\",\n",
    "    }\n",
    ")\n",
    "\n",
    "# Interpolate rating\n",
    "interpolating_ratio = 0.9\n",
    "pdf[\"rating\"] = (\n",
    "    pdf[\"rating\"] * (1 - interpolating_ratio)\n",
    "    + pdf[\"reference_rating\"] * interpolating_ratio\n",
    ")\n",
    "\n",
    "# Normalize rating\n",
    "pdf[\"norm_rating\"] = (pdf[\"rating\"] - pdf[\"rating\"].min()) / (\n",
    "    pdf[\"rating\"].max() - pdf[\"rating\"].min()\n",
    ")\n",
    "pdf[\"norm_rating_deviation\"] = pdf[\"rating_deviation\"] / (\n",
    "    pdf[\"rating\"].max() - pdf[\"rating\"].min()\n",
    ")\n",
    "pdf[\"norm_rating_volatility\"] = pdf[\"rating_volatility\"] / (\n",
    "    pdf[\"rating\"].max() - pdf[\"rating\"].min()\n",
    ")\n",
    "pdf[\"rating_quantile\"] = pdf[\"norm_rating\"].rank(pct=True)\n",
    "\n",
    "# Get APPs col\n",
    "pdf[\"input_output\"] = pdf[[\"inputs\", \"answers\"]].apply(\n",
    "    lambda row: {\"inputs\": row[\"inputs\"], \"outputs\": row[\"answers\"]}, axis=1\n",
    ")\n",
    "\n",
    "# Check existence in APPs\n",
    "rdf = concatenate_datasets(\n",
    "    [\n",
    "        load_dataset(\"codeparrot/apps\", \"all\", cache_dir=\"./cache\", split=\"train\"),\n",
    "        load_dataset(\"codeparrot/apps\", \"all\", cache_dir=\"./cache\", split=\"test\"),\n",
    "    ]\n",
    ").to_pandas()\n",
    "rdf = rdf[rdf[\"url\"].str.startswith(\"https://codeforces.com/problemset/problem/\")]\n",
    "rdf_indices = (\n",
    "    rdf[\"url\"].apply(lambda x: (int(x.split(\"/\")[-2]), x.split(\"/\")[-1])).to_list()\n",
    ")\n",
    "pdf[\"ever_exist\"] = pdf.apply(\n",
    "    lambda x: (x[\"contestId\"], x[\"index\"]) in rdf_indices, axis=1\n",
    ")\n",
    "\n",
    "# Rename and reorder\n",
    "assert not pdf.isna().any().any()\n",
    "rename_dict = {\n",
    "    \"contestId\": \"contest_id\",\n",
    "    \"index\": \"problem_index\",\n",
    "    \"norm_rating\": \"rating\",\n",
    "    \"norm_rating_deviation\": \"rating_std\",\n",
    "    \"norm_rating_volatility\": \"rating_volatility\",\n",
    "    \"rating_quantile\": \"rating_quantile\",\n",
    "    \"tag\": \"tag\",\n",
    "    \"detailed_tag\": \"detailed_tag\",\n",
    "    \"name\": \"problem_name\",\n",
    "    \"problem_main\": \"problem_main\",\n",
    "    \"problem_note\": \"problem_note\",\n",
    "    \"input_spec\": \"input_spec\",\n",
    "    \"output_spec\": \"output_spec\",\n",
    "    \"sample_inputs\": \"sample_inputs\",\n",
    "    \"sample_outputs\": \"sample_outputs\",\n",
    "    \"inputs\": \"inputs\",\n",
    "    \"answers\": \"answers\",\n",
    "    \"input_output\": \"input_output\",\n",
    "    \"id_0\": \"solution_id_0\",\n",
    "    \"source_0\": \"solution_0\",\n",
    "    \"outputs_0\": \"outputs_0\",\n",
    "    \"id_1\": \"solution_id_1\",\n",
    "    \"source_1\": \"solution_1\",\n",
    "    \"outputs_1\": \"outputs_1\",\n",
    "    \"id_2\": \"solution_id_2\",\n",
    "    \"source_2\": \"solution_2\",\n",
    "    \"outputs_2\": \"outputs_2\",\n",
    "    \"rating\": \"unnorm_rating\",\n",
    "    \"rating_deviation\": \"unnorm_rating_std\",\n",
    "    \"rating_volatility\": \"unnorm_rating_volatility\",\n",
    "    \"reference_rating\": \"reference_rating\",\n",
    "    \"tags\": \"original_tags\",\n",
    "    \"ever_exist\": \"ever_exist\",\n",
    "}\n",
    "pdf = pdf[rename_dict.keys()]\n",
    "pdf.rename(columns=rename_dict, inplace=True)\n",
    "\n",
    "\n",
    "# Split\n",
    "test_size = 4000\n",
    "pdf[\"input_length\"] = pdf[\"inputs\"].apply(len)\n",
    "eval_indices = pdf.nlargest(test_size, \"input_length\").index\n",
    "pdf_eval, pdf_train = pdf.loc[eval_indices], pdf.drop(eval_indices)\n",
    "pdf_eval.drop(columns=[\"input_length\"], inplace=True)\n",
    "pdf_train.drop(columns=[\"input_length\"], inplace=True)\n",
    "\n",
    "\n",
    "DatasetDict(\n",
    "    {\n",
    "        \"train\": Dataset.from_pandas(pdf_train.reset_index(drop=True)),\n",
    "        \"eval\": Dataset.from_pandas(pdf_eval.reset_index(drop=True)),\n",
    "    }\n",
    ").save_to_disk(\"./prepub/Codeforces\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>contest_id</th>\n",
       "      <th>problem_index</th>\n",
       "      <th>rating</th>\n",
       "      <th>rating_std</th>\n",
       "      <th>rating_volatility</th>\n",
       "      <th>rating_quantile</th>\n",
       "      <th>tag</th>\n",
       "      <th>detailed_tag</th>\n",
       "      <th>problem_name</th>\n",
       "      <th>problem_main</th>\n",
       "      <th>problem_note</th>\n",
       "      <th>input_spec</th>\n",
       "      <th>output_spec</th>\n",
       "      <th>sample_inputs</th>\n",
       "      <th>sample_outputs</th>\n",
       "      <th>inputs</th>\n",
       "      <th>answers</th>\n",
       "      <th>input_output</th>\n",
       "      <th>solution_id_0</th>\n",
       "      <th>solution_0</th>\n",
       "      <th>outputs_0</th>\n",
       "      <th>solution_id_1</th>\n",
       "      <th>solution_1</th>\n",
       "      <th>outputs_1</th>\n",
       "      <th>solution_id_2</th>\n",
       "      <th>solution_2</th>\n",
       "      <th>outputs_2</th>\n",
       "      <th>unnorm_rating</th>\n",
       "      <th>unnorm_rating_std</th>\n",
       "      <th>unnorm_rating_volatility</th>\n",
       "      <th>reference_rating</th>\n",
       "      <th>original_tags</th>\n",
       "      <th>ever_exist</th>\n",
       "      <th>input_length</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>A</td>\n",
       "      <td>0.149277</td>\n",
       "      <td>0.027239</td>\n",
       "      <td>0.000021</td>\n",
       "      <td>0.208535</td>\n",
       "      <td>Math</td>\n",
       "      <td>math</td>\n",
       "      <td>Theatre Square</td>\n",
       "      <td>Theatre Square in the capital city of Berland ...</td>\n",
       "      <td></td>\n",
       "      <td>The input contains three positive integer numb...</td>\n",
       "      <td>Write the needed number of flagstones.</td>\n",
       "      <td>[6 6 4]</td>\n",
       "      <td>[4]</td>\n",
       "      <td>[6 6 4\\r\\n, 1 1 1\\r\\n, 2 1 1\\r\\n, 1 2 1\\r\\n, 2...</td>\n",
       "      <td>[4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...</td>\n",
       "      <td>{'inputs': ['6 6 4\r\n",
       "', '1 1 1\r\n",
       "', '2 1 1\r\n",
       "', '...</td>\n",
       "      <td>254282675</td>\n",
       "      <td>import math\\n\\ndef flagstones_needed(n, m, a):...</td>\n",
       "      <td>[4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...</td>\n",
       "      <td>152514708</td>\n",
       "      <td>import math\\n(x, y, z) = list(map(float, input...</td>\n",
       "      <td>[4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...</td>\n",
       "      <td>152424760</td>\n",
       "      <td>import math\\n(x, y, z) = list(map(float, input...</td>\n",
       "      <td>[4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...</td>\n",
       "      <td>1050.510591</td>\n",
       "      <td>78.430114</td>\n",
       "      <td>0.059986</td>\n",
       "      <td>1000.0</td>\n",
       "      <td>[math]</td>\n",
       "      <td>False</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>B</td>\n",
       "      <td>0.345153</td>\n",
       "      <td>0.025926</td>\n",
       "      <td>0.000021</td>\n",
       "      <td>0.538562</td>\n",
       "      <td>Implement</td>\n",
       "      <td>math</td>\n",
       "      <td>Spreadsheet</td>\n",
       "      <td>In the popular spreadsheets systems (for examp...</td>\n",
       "      <td></td>\n",
       "      <td>The first line of the input contains integer n...</td>\n",
       "      <td>Write n lines, each line should contain a cell...</td>\n",
       "      <td>[2\\nR23C55\\nBC23]</td>\n",
       "      <td>[BC23\\nR23C55]</td>\n",
       "      <td>[2\\r\\nR23C55\\r\\nBC23\\r\\n, 1\\r\\nA1\\r\\n, 5\\r\\nR8...</td>\n",
       "      <td>[BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...</td>\n",
       "      <td>{'inputs': ['2\r\n",
       "R23C55\r\n",
       "BC23\r\n",
       "', '1\r\n",
       "A1\r\n",
       "', '5...</td>\n",
       "      <td>213963824</td>\n",
       "      <td>import re\\nBASE = len('ABCDEFGHIJKLMNOPQRSTUVW...</td>\n",
       "      <td>[BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...</td>\n",
       "      <td>233814981</td>\n",
       "      <td>import re\\nBASE = len('ABCDEFGHIJKLMNOPQRSTUVW...</td>\n",
       "      <td>[BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...</td>\n",
       "      <td>216532197</td>\n",
       "      <td>def col_num_to_label(col_num):\\n    col_label ...</td>\n",
       "      <td>[BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...</td>\n",
       "      <td>1614.497246</td>\n",
       "      <td>74.647468</td>\n",
       "      <td>0.059976</td>\n",
       "      <td>1600.0</td>\n",
       "      <td>[implementation, math]</td>\n",
       "      <td>False</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>C</td>\n",
       "      <td>0.504523</td>\n",
       "      <td>0.026770</td>\n",
       "      <td>0.000021</td>\n",
       "      <td>0.803863</td>\n",
       "      <td>Math</td>\n",
       "      <td>geometry</td>\n",
       "      <td>Ancient Berland Circus</td>\n",
       "      <td>Nowadays all circuses in Berland have a round ...</td>\n",
       "      <td></td>\n",
       "      <td>The input file consists of three lines, each o...</td>\n",
       "      <td>Output the smallest possible area of the ancie...</td>\n",
       "      <td>[0.000000 0.000000\\n1.000000 1.000000\\n0.00000...</td>\n",
       "      <td>[1.00000000]</td>\n",
       "      <td>[0.000000 0.000000\\r\\n1.000000 1.000000\\r\\n0.0...</td>\n",
       "      <td>[1.00000000\\r\\n, 9991.27897663\\r\\n, 4268.87997...</td>\n",
       "      <td>{'inputs': ['0.000000 0.000000\r\n",
       "1.000000 1.000...</td>\n",
       "      <td>219229345</td>\n",
       "      <td>import math\\nPI = 3.141592654\\npoint = [[0.0] ...</td>\n",
       "      <td>[1.00000000\\r\\n, 9991.27878603\\r\\n, 4268.87997...</td>\n",
       "      <td>207685115</td>\n",
       "      <td>import math\\n\\ndef check(n):\\n    a = 2 * math...</td>\n",
       "      <td>[1.000000\\r\\n, 9991.278788\\r\\n, 4268.879975\\r\\...</td>\n",
       "      <td>3720438</td>\n",
       "      <td>__author__ = 'Eddie'\\n\\ndef get_filehandler(is...</td>\n",
       "      <td>[1.0000000000000002\\r\\n, 9991.278787608775\\r\\n...</td>\n",
       "      <td>2073.371451</td>\n",
       "      <td>77.078587</td>\n",
       "      <td>0.059990</td>\n",
       "      <td>2100.0</td>\n",
       "      <td>[geometry, math]</td>\n",
       "      <td>False</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   contest_id problem_index    rating  rating_std  rating_volatility  \\\n",
       "0           1             A  0.149277    0.027239           0.000021   \n",
       "1           1             B  0.345153    0.025926           0.000021   \n",
       "2           1             C  0.504523    0.026770           0.000021   \n",
       "\n",
       "   rating_quantile        tag detailed_tag            problem_name  \\\n",
       "0         0.208535       Math         math          Theatre Square   \n",
       "1         0.538562  Implement         math             Spreadsheet   \n",
       "2         0.803863       Math     geometry  Ancient Berland Circus   \n",
       "\n",
       "                                        problem_main problem_note  \\\n",
       "0  Theatre Square in the capital city of Berland ...                \n",
       "1  In the popular spreadsheets systems (for examp...                \n",
       "2  Nowadays all circuses in Berland have a round ...                \n",
       "\n",
       "                                          input_spec  \\\n",
       "0  The input contains three positive integer numb...   \n",
       "1  The first line of the input contains integer n...   \n",
       "2  The input file consists of three lines, each o...   \n",
       "\n",
       "                                         output_spec  \\\n",
       "0             Write the needed number of flagstones.   \n",
       "1  Write n lines, each line should contain a cell...   \n",
       "2  Output the smallest possible area of the ancie...   \n",
       "\n",
       "                                       sample_inputs  sample_outputs  \\\n",
       "0                                            [6 6 4]             [4]   \n",
       "1                                  [2\\nR23C55\\nBC23]  [BC23\\nR23C55]   \n",
       "2  [0.000000 0.000000\\n1.000000 1.000000\\n0.00000...    [1.00000000]   \n",
       "\n",
       "                                              inputs  \\\n",
       "0  [6 6 4\\r\\n, 1 1 1\\r\\n, 2 1 1\\r\\n, 1 2 1\\r\\n, 2...   \n",
       "1  [2\\r\\nR23C55\\r\\nBC23\\r\\n, 1\\r\\nA1\\r\\n, 5\\r\\nR8...   \n",
       "2  [0.000000 0.000000\\r\\n1.000000 1.000000\\r\\n0.0...   \n",
       "\n",
       "                                             answers  \\\n",
       "0  [4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...   \n",
       "1  [BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...   \n",
       "2  [1.00000000\\r\\n, 9991.27897663\\r\\n, 4268.87997...   \n",
       "\n",
       "                                        input_output  solution_id_0  \\\n",
       "0  {'inputs': ['6 6 4\n",
       "', '1 1 1\n",
       "', '2 1 1\n",
       "', '...      254282675   \n",
       "1  {'inputs': ['2\n",
       "R23C55\n",
       "BC23\n",
       "', '1\n",
       "A1\n",
       "', '5...      213963824   \n",
       "2  {'inputs': ['0.000000 0.000000\n",
       "1.000000 1.000...      219229345   \n",
       "\n",
       "                                          solution_0  \\\n",
       "0  import math\\n\\ndef flagstones_needed(n, m, a):...   \n",
       "1  import re\\nBASE = len('ABCDEFGHIJKLMNOPQRSTUVW...   \n",
       "2  import math\\nPI = 3.141592654\\npoint = [[0.0] ...   \n",
       "\n",
       "                                           outputs_0  solution_id_1  \\\n",
       "0  [4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...      152514708   \n",
       "1  [BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...      233814981   \n",
       "2  [1.00000000\\r\\n, 9991.27878603\\r\\n, 4268.87997...      207685115   \n",
       "\n",
       "                                          solution_1  \\\n",
       "0  import math\\n(x, y, z) = list(map(float, input...   \n",
       "1  import re\\nBASE = len('ABCDEFGHIJKLMNOPQRSTUVW...   \n",
       "2  import math\\n\\ndef check(n):\\n    a = 2 * math...   \n",
       "\n",
       "                                           outputs_1  solution_id_2  \\\n",
       "0  [4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...      152424760   \n",
       "1  [BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...      216532197   \n",
       "2  [1.000000\\r\\n, 9991.278788\\r\\n, 4268.879975\\r\\...        3720438   \n",
       "\n",
       "                                          solution_2  \\\n",
       "0  import math\\n(x, y, z) = list(map(float, input...   \n",
       "1  def col_num_to_label(col_num):\\n    col_label ...   \n",
       "2  __author__ = 'Eddie'\\n\\ndef get_filehandler(is...   \n",
       "\n",
       "                                           outputs_2  unnorm_rating  \\\n",
       "0  [4\\r\\n, 1\\r\\n, 2\\r\\n, 2\\r\\n, 4\\r\\n, 1\\r\\n, 1\\r...    1050.510591   \n",
       "1  [BC23\\r\\nR23C55\\r\\n, R1C1\\r\\n, C8\\r\\nR1C4\\r\\nB...    1614.497246   \n",
       "2  [1.0000000000000002\\r\\n, 9991.278787608775\\r\\n...    2073.371451   \n",
       "\n",
       "   unnorm_rating_std  unnorm_rating_volatility  reference_rating  \\\n",
       "0          78.430114                  0.059986            1000.0   \n",
       "1          74.647468                  0.059976            1600.0   \n",
       "2          77.078587                  0.059990            2100.0   \n",
       "\n",
       "            original_tags  ever_exist  input_length  \n",
       "0                  [math]       False            20  \n",
       "1  [implementation, math]       False             5  \n",
       "2        [geometry, math]       False            50  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.set_option(\"display.max_columns\", None)\n",
    "pdf.head(3)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
