{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "3777e7f9-9cb9-41aa-8a75-073a181cbefd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "from tqdm.notebook import tqdm\n",
    "import pickle\n",
    "import numpy as np\n",
    "import random\n",
    "import shutil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "70ce02e3-a797-4335-a68d-170269fe6f2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed the NumPy and stdlib generators so that any random sampling is repeatable.\n",
    "np.random.seed(0)\n",
    "random.seed(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5f075081-812f-423d-ad4b-3d72c3b372a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run parameters.\n",
    "# prb_pertask: cap on the submissions sampled per language from each problem;\n",
    "# None keeps every accepted submission.\n",
    "prb_pertask = None\n",
    "# NOTE(review): ntasks is not referenced by any other cell in this notebook.\n",
    "ntasks = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5f490ac9-9e98-46c8-918e-b4e1ce083f95",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Languages kept when the collected submissions are filtered.\n",
    "supported_langs = [\n",
    "    'Ruby', 'JavaScript', 'Go',\n",
    "    'Python', 'Java', 'PHP',\n",
    "]\n",
    "# Folder holding one metadata CSV per problem.\n",
    "metadata_dir = \"../Project_CodeNet/metadata/\"\n",
    "# NOTE(review): codenet_langs is not used by any other cell in this notebook.\n",
    "codenet_langs = dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f399770f-ef82-4b72-b509-afdf701f33f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate problem ids; only the first 10 are turned into metadata paths below.\n",
    "problems = ['p00389', 'p00595', 'p03493', 'p02256', 'p03470', 'p02392', 'p02391', 'p03814', 'p02393', 'p02390', 'p04043', 'p03456', 'p02389', 'p04011', 'p02388', 'p02946', 'p02399', 'p02969', 'p02915', 'p02394', 'p04029', 'p04044', 'p02999', 'p03433', 'p03997', 'p02909', 'p03759', 'p03693', 'p03631', 'p02400', 'p02403', 'p03219', 'p03242', 'p03135', 'p02418', 'p02379', 'p03605', 'p03813', 'p02396', 'p02415', 'p02397', 'p02398', 'p03227', 'p03623', 'p03729', 'p03852', 'p02402', 'p00134', 'p00019', 'p02879', 'p03210', 'p03671', 'p03485', 'p03192', 'p03447', 'p03697', 'p03860', 'p03469', 'p03635', 'p03501', 'p03563', 'p03643', 'p03597', 'p02859', 'p00013', 'p02416', 'p03826', 'p03773', 'p00006', 'p02407', 'p03777', 'p03477', 'p03844', 'p00000', 'p03795', 'p00024', 'p03473', 'p03943', 'p00575', 'p03834', 'p03556', 'p00007', 'p00014', 'p00158', 'p00018', 'p00075', 'p00564', 'p00095', 'p00553', 'p00001', 'p00057', 'p00073', 'p00101', 'p00433', 'p00020', 'p00080', 'p00046', 'p00173', 'p00352', 'p00094']\n",
    "# Each selected problem has a metadata file named <problem_id>.csv under metadata_dir.\n",
    "metadata_files = [os.path.join(metadata_dir, f'{pid}.csv') for pid in problems[:10]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "f7fb3a60-ca3c-4ffe-8837-5c195fbf999a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7bd74af0f5dd4627a98f0746817980b9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/10 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Gather one frame per (file, language) group and concatenate once at the end.\n",
    "# DataFrame.append was removed in pandas 2.0 and re-copied the whole\n",
    "# accumulated frame on every call.\n",
    "frames = []\n",
    "\n",
    "for fpath in tqdm(metadata_files):\n",
    "    df = pd.read_csv(fpath)\n",
    "    # Keep accepted submissions only. reset_index() stores the previous row\n",
    "    # labels in an 'index' column, which later cells use for counting.\n",
    "    df = df[df['status'] == 'Accepted'].reset_index()\n",
    "\n",
    "    # .groups maps every language to the row labels of its submissions.\n",
    "    for lang, idxs in df.groupby('language').groups.items():\n",
    "        if prb_pertask is not None:\n",
    "            # Random sample of at most prb_pertask submissions for this language.\n",
    "            idxs = np.random.permutation(idxs)[:prb_pertask]\n",
    "        frames.append(df.loc[idxs])\n",
    "\n",
    "# Same rows, order and index labels as the chained appends produced;\n",
    "# datadf stays None when nothing was loaded.\n",
    "datadf = pd.concat(frames) if frames else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "27934eea-0208-41f7-a492-79d42204dec1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of collected submissions per language (one 'index' column).\n",
    "langcounts = datadf.groupby('language')[['index']].count()\n",
    "\n",
    "# Languages with more than 2000 submissions; this selection is computed but\n",
    "# immediately overridden by the curated list below.\n",
    "langs = langcounts.loc[langcounts['index'] > 2000].index.tolist()\n",
    "\n",
    "langs = supported_langs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "6970e8a1-3708-4a8e-b0b5-1285504dc066",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the submissions written in one of the selected languages.\n",
    "datadf_final = datadf.loc[datadf['language'].isin(langs)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a8692ab2-980c-4f2b-8951-b2494059d371",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>language</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Go</th>\n",
       "      <td>964</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Java</th>\n",
       "      <td>7947</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JavaScript</th>\n",
       "      <td>862</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PHP</th>\n",
       "      <td>622</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Python</th>\n",
       "      <td>22099</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Ruby</th>\n",
       "      <td>2872</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "            index\n",
       "language         \n",
       "Go            964\n",
       "Java         7947\n",
       "JavaScript    862\n",
       "PHP           622\n",
       "Python      22099\n",
       "Ruby         2872"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datadf_final[['index', 'language']].groupby(['language']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f27341d3-0a11-4a78-bb3c-873a5cfd4417",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>submission_id</th>\n",
       "      <th>problem_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>date</th>\n",
       "      <th>language</th>\n",
       "      <th>original_language</th>\n",
       "      <th>filename_ext</th>\n",
       "      <th>status</th>\n",
       "      <th>cpu_time</th>\n",
       "      <th>memory</th>\n",
       "      <th>code_size</th>\n",
       "      <th>accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>101</td>\n",
       "      <td>s861811189</td>\n",
       "      <td>p00389</td>\n",
       "      <td>u198339703</td>\n",
       "      <td>1575298620</td>\n",
       "      <td>Go</td>\n",
       "      <td>Go</td>\n",
       "      <td>go</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1156.0</td>\n",
       "      <td>254</td>\n",
       "      <td>20/20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>s433619182</td>\n",
       "      <td>p00389</td>\n",
       "      <td>u788025930</td>\n",
       "      <td>1588319933</td>\n",
       "      <td>Java</td>\n",
       "      <td>JAVA</td>\n",
       "      <td>java</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>80.0</td>\n",
       "      <td>26300.0</td>\n",
       "      <td>456</td>\n",
       "      <td>20/20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>s061491069</td>\n",
       "      <td>p00389</td>\n",
       "      <td>u561121048</td>\n",
       "      <td>1572943692</td>\n",
       "      <td>Java</td>\n",
       "      <td>JAVA</td>\n",
       "      <td>java</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>90.0</td>\n",
       "      <td>26412.0</td>\n",
       "      <td>788</td>\n",
       "      <td>20/20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>s591786574</td>\n",
       "      <td>p00389</td>\n",
       "      <td>u805699996</td>\n",
       "      <td>1552289482</td>\n",
       "      <td>Java</td>\n",
       "      <td>JAVA</td>\n",
       "      <td>java</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>60.0</td>\n",
       "      <td>26372.0</td>\n",
       "      <td>441</td>\n",
       "      <td>20/20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>s494746024</td>\n",
       "      <td>p00389</td>\n",
       "      <td>u540218629</td>\n",
       "      <td>1556686052</td>\n",
       "      <td>JavaScript</td>\n",
       "      <td>JavaScript</td>\n",
       "      <td>js</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>90.0</td>\n",
       "      <td>15160.0</td>\n",
       "      <td>281</td>\n",
       "      <td>20/20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10745</th>\n",
       "      <td>18576</td>\n",
       "      <td>s376469439</td>\n",
       "      <td>p02390</td>\n",
       "      <td>u207046604</td>\n",
       "      <td>1532616394</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>50.0</td>\n",
       "      <td>6860.0</td>\n",
       "      <td>103</td>\n",
       "      <td>10/10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10746</th>\n",
       "      <td>18577</td>\n",
       "      <td>s673831237</td>\n",
       "      <td>p02390</td>\n",
       "      <td>u285844567</td>\n",
       "      <td>1532584132</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>40.0</td>\n",
       "      <td>8880.0</td>\n",
       "      <td>95</td>\n",
       "      <td>10/10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747</th>\n",
       "      <td>18578</td>\n",
       "      <td>s287886084</td>\n",
       "      <td>p02390</td>\n",
       "      <td>u069741120</td>\n",
       "      <td>1532406600</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>50.0</td>\n",
       "      <td>6860.0</td>\n",
       "      <td>89</td>\n",
       "      <td>10/10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10748</th>\n",
       "      <td>18579</td>\n",
       "      <td>s795595369</td>\n",
       "      <td>p02390</td>\n",
       "      <td>u624226399</td>\n",
       "      <td>1476622784</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>40.0</td>\n",
       "      <td>8600.0</td>\n",
       "      <td>88</td>\n",
       "      <td>10/10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10749</th>\n",
       "      <td>18580</td>\n",
       "      <td>s904151217</td>\n",
       "      <td>p02390</td>\n",
       "      <td>u476445487</td>\n",
       "      <td>1427295443</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>20.0</td>\n",
       "      <td>6092.0</td>\n",
       "      <td>112</td>\n",
       "      <td>10/10</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>35366 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       index submission_id problem_id     user_id        date    language  \\\n",
       "91       101    s861811189     p00389  u198339703  1575298620          Go   \n",
       "6          7    s433619182     p00389  u788025930  1588319933        Java   \n",
       "7          8    s061491069     p00389  u561121048  1572943692        Java   \n",
       "8          9    s591786574     p00389  u805699996  1552289482        Java   \n",
       "0          0    s494746024     p00389  u540218629  1556686052  JavaScript   \n",
       "...      ...           ...        ...         ...         ...         ...   \n",
       "10745  18576    s376469439     p02390  u207046604  1532616394        Ruby   \n",
       "10746  18577    s673831237     p02390  u285844567  1532584132        Ruby   \n",
       "10747  18578    s287886084     p02390  u069741120  1532406600        Ruby   \n",
       "10748  18579    s795595369     p02390  u624226399  1476622784        Ruby   \n",
       "10749  18580    s904151217     p02390  u476445487  1427295443        Ruby   \n",
       "\n",
       "      original_language filename_ext    status  cpu_time   memory  code_size  \\\n",
       "91                   Go           go  Accepted       0.0   1156.0        254   \n",
       "6                  JAVA         java  Accepted      80.0  26300.0        456   \n",
       "7                  JAVA         java  Accepted      90.0  26412.0        788   \n",
       "8                  JAVA         java  Accepted      60.0  26372.0        441   \n",
       "0            JavaScript           js  Accepted      90.0  15160.0        281   \n",
       "...                 ...          ...       ...       ...      ...        ...   \n",
       "10745              Ruby           rb  Accepted      50.0   6860.0        103   \n",
       "10746              Ruby           rb  Accepted      40.0   8880.0         95   \n",
       "10747              Ruby           rb  Accepted      50.0   6860.0         89   \n",
       "10748              Ruby           rb  Accepted      40.0   8600.0         88   \n",
       "10749              Ruby           rb  Accepted      20.0   6092.0        112   \n",
       "\n",
       "      accuracy  \n",
       "91       20/20  \n",
       "6        20/20  \n",
       "7        20/20  \n",
       "8        20/20  \n",
       "0        20/20  \n",
       "...        ...  \n",
       "10745    10/10  \n",
       "10746    10/10  \n",
       "10747    10/10  \n",
       "10748    10/10  \n",
       "10749    10/10  \n",
       "\n",
       "[35366 rows x 13 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datadf_final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c6ef9c8d-55a7-49f4-88c6-03dacd5edf91",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9fa85047f1174707a61108b01855c73a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "outdir = \"../codenet-genai/matchexp_data\"\n",
    "\n",
    "# Only four columns are read, so iterate over plain tuples instead of\n",
    "# iterrows(), which builds a Series object for every row.\n",
    "copy_cols = ['submission_id', 'problem_id', 'filename_ext', 'language']\n",
    "rows = datadf_final[copy_cols].itertuples(index=False, name=None)\n",
    "\n",
    "# A generator has no len(), so pass total explicitly; without it tqdm can only\n",
    "# show a running count (\"0it\") rather than real progress.\n",
    "for submission_id, problem_id, ext, lang in tqdm(rows, total=len(datadf_final)):\n",
    "    src_fpath = f\"../Project_CodeNet/data/{problem_id}/{lang}/{submission_id}.{ext}\"\n",
    "    dest_folder = f\"{outdir}/{lang}/{problem_id}\"\n",
    "\n",
    "    # Copying into a directory keeps the source file name.\n",
    "    os.makedirs(dest_folder, exist_ok=True)\n",
    "    shutil.copy(src_fpath, dest_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1c683831-4b7d-4864-b5f8-1fbf38a863f1",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4239b9be-6e4c-46f9-ae1a-cfaafd031bcc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
