{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a105bfba",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "from tqdm.notebook import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e280cc77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that receives one sub-directory of train/val JSONL files per language pair.\n",
    "outdir = './random-data/codenet-jsonl-processed/'\n",
    "# Root of the Project CodeNet checkout; submissions are read from its 'data' sub-directory.\n",
    "codenetdir = '../Project_CodeNet'\n",
    "\n",
    "# Maps a CodeNet language directory name to [identifier, version-like string, file extension].\n",
    "# Only the file extension (index 2) is read in this notebook; the first two\n",
    "# fields presumably name a runtime and its version -- TODO confirm.\n",
    "exts = {\n",
    "    lang: [ident, version, suffix]\n",
    "    for lang, ident, version, suffix in (\n",
    "        ('Python', 'python', '3.9.4', 'py'),\n",
    "        ('Java', 'java', '15.0.2', 'java'),\n",
    "        ('Ruby', 'ruby', '3.0.1', 'rb'),\n",
    "        ('JavaScript', 'javascript', '16.3.0', 'js'),\n",
    "        ('PHP', 'php', '8.0.2', 'php'),\n",
    "        ('C', 'c', '10.2.0', 'c'),\n",
    "        ('C++', 'c++', '10.2.0', 'cpp'),\n",
    "        ('C#', 'csharp', '5.0.201', 'cs'),\n",
    "        ('Go', 'go', '1.16.2', 'go'),\n",
    "        ('Scala', 'scala', '3.0.0', 'scala'),\n",
    "    )\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "fa7dd8be",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(code, keep_ind, tab_width=4):\n",
    "    \"\"\"Flatten source code onto a single line using <NL> / <IND> marker tokens.\n",
    "\n",
    "    Empty and whitespace-only lines are dropped; the remaining lines are\n",
    "    stripped and joined with ' <NL> '.\n",
    "\n",
    "    Args:\n",
    "        code: Source text whose lines are separated by newline characters.\n",
    "        keep_ind: If falsy, indentation is discarded. If truthy, each line is\n",
    "            prefixed with one '<IND> ' token per indentation level, where one\n",
    "            level is the smallest non-zero indentation found in `code`.\n",
    "        tab_width: Number of columns a leading tab counts for when measuring\n",
    "            indentation (only used when `keep_ind` is truthy).\n",
    "\n",
    "    Returns:\n",
    "        The flattened code as a single string ('' when no non-blank line exists).\n",
    "    \"\"\"\n",
    "    newline_token = ' <NL> '\n",
    "    indent_token = '<IND> '\n",
    "\n",
    "    # Filter on the stripped text so whitespace-only lines are dropped as well;\n",
    "    # testing len(x) > 0 alone let them through as empty segments between tokens.\n",
    "    codelines = [x for x in code.split('\\n') if x.strip()]\n",
    "\n",
    "    if not keep_ind:\n",
    "        return newline_token.join(x.strip() for x in codelines)\n",
    "\n",
    "    # Measure indentation in columns with tabs expanded: counting only leading\n",
    "    # spaces would report 0 for tab-indented code and flatten its nesting.\n",
    "    lind = []\n",
    "    for x in codelines:\n",
    "        expanded = x.expandtabs(tab_width)\n",
    "        lind.append(len(expanded) - len(expanded.lstrip(' ')))\n",
    "\n",
    "    # One indentation level = the smallest non-zero indent; 0 means nothing is indented.\n",
    "    lind_min = min((n for n in lind if n > 0), default=0)\n",
    "\n",
    "    final = []\n",
    "    for depth, line in zip(lind, codelines):\n",
    "        repl = depth // lind_min if lind_min > 0 else 0\n",
    "        final.append(indent_token * repl + line.strip())\n",
    "\n",
    "    return newline_token.join(final)\n",
    "\n",
    "def unprocess(code):\n",
    "    \"\"\"Expand <NL> / <IND> marker tokens back into multi-line source text.\n",
    "\n",
    "    Args:\n",
    "        code: Flattened code in which '<NL>' marks a line break and each\n",
    "            '<IND>' marks one indentation level.\n",
    "\n",
    "    Returns:\n",
    "        The code with every '<NL>' turned into a newline and every '<IND>'\n",
    "        turned into four spaces.\n",
    "    \"\"\"\n",
    "    # preprocess() emits the tokens padded as ' <NL> ' and '<IND> '. Replace the\n",
    "    # padded forms first so that padding is consumed too; otherwise every line\n",
    "    # after the first starts with a stray space and each level is 5 columns wide.\n",
    "    # The bare forms are handled afterwards for text that lacks the padding.\n",
    "    code = code.replace(' <NL> ', '\\n').replace('<NL>', '\\n')\n",
    "    code = code.replace('<IND> ', ' ' * 4).replace('<IND>', ' ' * 4)\n",
    "\n",
    "    return code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "22114209",
   "metadata": {},
   "outputs": [],
   "source": [
    "# codepairs.pkl sits in the directory already configured as `outdir`; derive the\n",
    "# path from it so the two cannot drift apart.\n",
    "pairs_fpath = os.path.join(outdir, 'codepairs.pkl')\n",
    "\n",
    "# NOTE(security): pickle.load can execute arbitrary code from the file -- only\n",
    "# load a codepairs.pkl that you generated yourself or otherwise trust.\n",
    "with open(pairs_fpath, 'rb') as f:\n",
    "    pairs = pickle.load(f)\n",
    "\n",
    "# The keys are the language pairs processed further down (split there on '-').\n",
    "print(list(pairs.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "2554464f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_code(prob, subid, lang):\n",
    "    \"\"\"Read one submission file from the CodeNet tree and return it preprocessed.\n",
    "\n",
    "    The file is looked up at `codenetdir`/data/`prob`/`lang`/`subid`.EXT, where\n",
    "    EXT is the third entry of `exts[lang]`.\n",
    "\n",
    "    Args:\n",
    "        prob: Name of the problem directory.\n",
    "        subid: Submission id, used as the file's base name.\n",
    "        lang: Key of `exts`; also the name of the language sub-directory.\n",
    "\n",
    "    Returns:\n",
    "        The result of preprocess() on the stripped file content. Indentation\n",
    "        tokens are kept only when `lang` is 'Python'.\n",
    "\n",
    "    Raises:\n",
    "        KeyError: If `lang` is not a key of `exts`.\n",
    "        OSError: If the submission file cannot be opened.\n",
    "        UnicodeDecodeError: If the file content is not valid UTF-8.\n",
    "    \"\"\"\n",
    "    ext = exts[lang][2]\n",
    "    fpath = os.path.join(codenetdir, 'data', prob, lang, f'{subid}.{ext}')\n",
    "\n",
    "    # Pin the encoding: without it open() falls back to the platform locale, so\n",
    "    # the same file could decode differently from one machine to another.\n",
    "    # Assumes the submissions are UTF-8 encoded -- TODO confirm against the dataset.\n",
    "    with open(fpath, 'r', encoding='utf-8') as f:\n",
    "        code = f.read().strip()\n",
    "\n",
    "    return preprocess(code, lang == 'Python')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "533e813d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0ad78a8c211348e19e0b42f9df13d52e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/7 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Number of pairs held out for validation per language pair; the rest is training data.\n",
    "N_VAL = 100\n",
    "# Seeded generator so that re-running the notebook reproduces the same split.\n",
    "split_rng = np.random.default_rng(0)\n",
    "\n",
    "for langpair, vals in tqdm(pairs.items()):\n",
    "\n",
    "    # Keys look like 'Python-Java': source language, then target language.\n",
    "    lang1, lang2 = langpair.split('-')\n",
    "\n",
    "    dirpath = os.path.join(outdir, langpair)\n",
    "    os.makedirs(dirpath, exist_ok=True)\n",
    "\n",
    "    train_fpath = os.path.join(dirpath, 'train.jsonl')\n",
    "    val_fpath = os.path.join(dirpath, 'val.jsonl')\n",
    "\n",
    "    # Keep the validation indices in a set: `i in ndarray` scans the whole array\n",
    "    # on every iteration, which made the loop quadratic in len(vals).\n",
    "    # With N_VAL or fewer pairs, everything ends up in val.jsonl.\n",
    "    val_idxs = set(split_rng.permutation(len(vals))[:N_VAL].tolist())\n",
    "\n",
    "    # `with` closes both files even if get_code() raises part-way through.\n",
    "    with open(train_fpath, 'w') as f_train, open(val_fpath, 'w') as f_val:\n",
    "        # Wrap `vals` itself (not enumerate) so tqdm knows the total and can\n",
    "        # display real progress instead of a bare iteration counter.\n",
    "        for i, val in enumerate(tqdm(vals, leave=False)):\n",
    "            prob, subid1, subid2 = val\n",
    "            srccode = get_code(prob, subid1, lang1)\n",
    "            tgtcode = get_code(prob, subid2, lang2)\n",
    "\n",
    "            line = {\n",
    "                'src': srccode, 'tgt': tgtcode,\n",
    "                'prob': prob, 'code1_id': subid1, 'code2_id': subid2\n",
    "            }\n",
    "\n",
    "            # Every index is in exactly one split: validation if sampled, else training.\n",
    "            f_out = f_val if i in val_idxs else f_train\n",
    "            f_out.write(json.dumps(line) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "deef04f7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Python-Java',\n",
       " 'JavaScript-C',\n",
       " 'C++-JavaScript',\n",
       " 'Java-C#',\n",
       " 'C++-C#',\n",
       " 'C#-PHP',\n",
       " 'Go-C++']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a1c5b4d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
