{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "import os\n",
    "import pandas as pd\n",
    "from tqdm.notebook import tqdm\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "supported_langs = ['Ruby', 'JavaScript', 'Go', 'Python', 'Java', 'PHP']\n",
    "metadata_dir = \"../Project_CodeNet/metadata/\"\n",
    "codenet_langs = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/graphcodebert-base\")\n",
    "\n",
    "def tokenize(code):\n",
    "    RM = ['Ċ', 'Ġ']\n",
    "    tokens = [x for x in tokenizer.tokenize(code) if x not in RM]\n",
    "    return len(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "metadata_files = [os.path.join(metadata_dir, x) for x in os.listdir(metadata_dir) if x.endswith('.csv') and x != 'problem_list.csv']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b52d0c6c790f4bbeb01510e5ffd2658a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/4053 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Token indices sequence length is longer than the specified maximum sequence length for this model (850 > 512). Running this sequence through the model will result in indexing errors\n"
     ]
    }
   ],
   "source": [
    "datadf = None\n",
    "\n",
    "for fpath in tqdm(metadata_files):\n",
    "    df = pd.read_csv(fpath)\n",
    "    df = df[df['status'] == 'Accepted']\n",
    "    df.reset_index(inplace=True)\n",
    "    df['token_len'] = 0\n",
    "    \n",
    "    \n",
    "    df_grps = df.groupby(['language'])\n",
    "    \n",
    "    for lang, idxs in df_grps.groups.items():\n",
    "        \n",
    "        if lang not in supported_langs:\n",
    "            continue\n",
    "            \n",
    "        temp = df.loc[idxs]\n",
    "        \n",
    "        for irow, datum in temp.iterrows():\n",
    "            submission_id = datum['submission_id']\n",
    "            problem_id = datum['problem_id']\n",
    "            ext = datum['filename_ext']\n",
    "            lang = datum['language']\n",
    "\n",
    "            src_fpath = f\"../Project_CodeNet/data/{problem_id}/{lang}/{submission_id}.{ext}\"\n",
    "            \n",
    "            with open(src_fpath, 'r') as f:\n",
    "                code = f.read()\n",
    "            \n",
    "            toklen = tokenize(code)\n",
    "            temp.at[irow, 'token_len'] = toklen\n",
    "            temp\n",
    "        \n",
    "        if datadf is None:\n",
    "            datadf = temp.copy()\n",
    "        else:\n",
    "            datadf = datadf.append(temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>index</th>\n",
       "      <th>submission_id</th>\n",
       "      <th>problem_id</th>\n",
       "      <th>user_id</th>\n",
       "      <th>date</th>\n",
       "      <th>language</th>\n",
       "      <th>original_language</th>\n",
       "      <th>filename_ext</th>\n",
       "      <th>status</th>\n",
       "      <th>cpu_time</th>\n",
       "      <th>memory</th>\n",
       "      <th>code_size</th>\n",
       "      <th>accuracy</th>\n",
       "      <th>token_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10</td>\n",
       "      <td>s166413314</td>\n",
       "      <td>p01916</td>\n",
       "      <td>u507118101</td>\n",
       "      <td>1490318310</td>\n",
       "      <td>Java</td>\n",
       "      <td>JAVA</td>\n",
       "      <td>java</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>50.0</td>\n",
       "      <td>24480.0</td>\n",
       "      <td>1996</td>\n",
       "      <td>35/35</td>\n",
       "      <td>773</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>11</td>\n",
       "      <td>s975413870</td>\n",
       "      <td>p01916</td>\n",
       "      <td>u473115569</td>\n",
       "      <td>1527404250</td>\n",
       "      <td>Java</td>\n",
       "      <td>JAVA</td>\n",
       "      <td>java</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>60.0</td>\n",
       "      <td>26244.0</td>\n",
       "      <td>399</td>\n",
       "      <td>35/35</td>\n",
       "      <td>125</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>12</td>\n",
       "      <td>s558704143</td>\n",
       "      <td>p01916</td>\n",
       "      <td>u473115569</td>\n",
       "      <td>1527404877</td>\n",
       "      <td>Java</td>\n",
       "      <td>JAVA</td>\n",
       "      <td>java</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>70.0</td>\n",
       "      <td>26344.0</td>\n",
       "      <td>550</td>\n",
       "      <td>35/35</td>\n",
       "      <td>199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>13</td>\n",
       "      <td>s950483083</td>\n",
       "      <td>p01916</td>\n",
       "      <td>u507781558</td>\n",
       "      <td>1555378387</td>\n",
       "      <td>Java</td>\n",
       "      <td>JAVA</td>\n",
       "      <td>java</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>40.0</td>\n",
       "      <td>24524.0</td>\n",
       "      <td>3737</td>\n",
       "      <td>35/35</td>\n",
       "      <td>1513</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>s743314309</td>\n",
       "      <td>p01916</td>\n",
       "      <td>u540218629</td>\n",
       "      <td>1523006160</td>\n",
       "      <td>JavaScript</td>\n",
       "      <td>JavaScript</td>\n",
       "      <td>js</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>80.0</td>\n",
       "      <td>15388.0</td>\n",
       "      <td>371</td>\n",
       "      <td>35/35</td>\n",
       "      <td>161</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>158</th>\n",
       "      <td>171</td>\n",
       "      <td>s416402934</td>\n",
       "      <td>p00387</td>\n",
       "      <td>u717526540</td>\n",
       "      <td>1544596049</td>\n",
       "      <td>Python</td>\n",
       "      <td>Python3</td>\n",
       "      <td>py</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>20.0</td>\n",
       "      <td>5596.0</td>\n",
       "      <td>142</td>\n",
       "      <td>16/16</td>\n",
       "      <td>47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>159</th>\n",
       "      <td>172</td>\n",
       "      <td>s315001932</td>\n",
       "      <td>p00387</td>\n",
       "      <td>u352394527</td>\n",
       "      <td>1543738216</td>\n",
       "      <td>Python</td>\n",
       "      <td>Python3</td>\n",
       "      <td>py</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>20.0</td>\n",
       "      <td>5580.0</td>\n",
       "      <td>61</td>\n",
       "      <td>16/16</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>156</td>\n",
       "      <td>s255718155</td>\n",
       "      <td>p00387</td>\n",
       "      <td>u797180951</td>\n",
       "      <td>1583683157</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>40.0</td>\n",
       "      <td>6872.0</td>\n",
       "      <td>47</td>\n",
       "      <td>16/16</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>145</th>\n",
       "      <td>157</td>\n",
       "      <td>s011018149</td>\n",
       "      <td>p00387</td>\n",
       "      <td>u247371045</td>\n",
       "      <td>1553208399</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>40.0</td>\n",
       "      <td>6872.0</td>\n",
       "      <td>54</td>\n",
       "      <td>16/16</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>158</td>\n",
       "      <td>s973326482</td>\n",
       "      <td>p00387</td>\n",
       "      <td>u300645821</td>\n",
       "      <td>1543360422</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>Ruby</td>\n",
       "      <td>rb</td>\n",
       "      <td>Accepted</td>\n",
       "      <td>40.0</td>\n",
       "      <td>6864.0</td>\n",
       "      <td>38</td>\n",
       "      <td>16/16</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2394784 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     index submission_id problem_id     user_id        date    language  \\\n",
       "6       10    s166413314     p01916  u507118101  1490318310        Java   \n",
       "7       11    s975413870     p01916  u473115569  1527404250        Java   \n",
       "8       12    s558704143     p01916  u473115569  1527404877        Java   \n",
       "9       13    s950483083     p01916  u507781558  1555378387        Java   \n",
       "0        0    s743314309     p01916  u540218629  1523006160  JavaScript   \n",
       "..     ...           ...        ...         ...         ...         ...   \n",
       "158    171    s416402934     p00387  u717526540  1544596049      Python   \n",
       "159    172    s315001932     p00387  u352394527  1543738216      Python   \n",
       "144    156    s255718155     p00387  u797180951  1583683157        Ruby   \n",
       "145    157    s011018149     p00387  u247371045  1553208399        Ruby   \n",
       "146    158    s973326482     p00387  u300645821  1543360422        Ruby   \n",
       "\n",
       "    original_language filename_ext    status  cpu_time   memory  code_size  \\\n",
       "6                JAVA         java  Accepted      50.0  24480.0       1996   \n",
       "7                JAVA         java  Accepted      60.0  26244.0        399   \n",
       "8                JAVA         java  Accepted      70.0  26344.0        550   \n",
       "9                JAVA         java  Accepted      40.0  24524.0       3737   \n",
       "0          JavaScript           js  Accepted      80.0  15388.0        371   \n",
       "..                ...          ...       ...       ...      ...        ...   \n",
       "158           Python3           py  Accepted      20.0   5596.0        142   \n",
       "159           Python3           py  Accepted      20.0   5580.0         61   \n",
       "144              Ruby           rb  Accepted      40.0   6872.0         47   \n",
       "145              Ruby           rb  Accepted      40.0   6872.0         54   \n",
       "146              Ruby           rb  Accepted      40.0   6864.0         38   \n",
       "\n",
       "    accuracy  token_len  \n",
       "6      35/35        773  \n",
       "7      35/35        125  \n",
       "8      35/35        199  \n",
       "9      35/35       1513  \n",
       "0      35/35        161  \n",
       "..       ...        ...  \n",
       "158    16/16         47  \n",
       "159    16/16         24  \n",
       "144    16/16         22  \n",
       "145    16/16         27  \n",
       "146    16/16         24  \n",
       "\n",
       "[2394784 rows x 14 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "datadf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
