{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import json\n",
    "import os"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "def preprocess(code, keep_ind):\n",
    "\n",
    "    newline_token = ' <NL> '\n",
    "    indent_token = '<IND> '\n",
    "\n",
    "    codelines = code.split('\\n')\n",
    "    codelines = [x for x in codelines if len(x) > 0]\n",
    "\n",
    "    if not keep_ind:\n",
    "        codelines = [x.strip() for x in codelines]\n",
    "    else:\n",
    "        codelines = [x.rstrip() for x in codelines]\n",
    "        lind = [len(x) - len(x.lstrip(' ')) for x in codelines]\n",
    "\n",
    "        try:\n",
    "            lind_min = min([x for x in lind if x > 0])\n",
    "        except:\n",
    "            lind_min = 0\n",
    "\n",
    "        final = []\n",
    "        for i, line in enumerate(codelines):\n",
    "            if lind_min > 0:\n",
    "                repl = int(lind[i] / lind_min)\n",
    "            else:\n",
    "                repl = 0\n",
    "            \n",
    "            replstr = indent_token * repl #' '.join([indent_token] * repl)\n",
    "            line = replstr + line.strip()\n",
    "            final.append(line)\n",
    "        \n",
    "        codelines = final\n",
    "    \n",
    "    code = newline_token.join(codelines)\n",
    "    return code\n",
    "\n",
    "def unprocess(code):\n",
    "\n",
    "    code = code.replace('<|NL|>', '\\n')\n",
    "    code = code.replace('<|IND|>', ' '*4)\n",
    "    \n",
    "    return code"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "infolder = \"data/codenet-jsonl\"\n",
    "outfolder = \"data/codenet-jsonl-processed\""
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "def process_jsonl(fpath, lang1, lang2):\n",
    "\n",
    "    result = []\n",
    "\n",
    "    with open(fpath, 'r') as f:\n",
    "        for line in f:\n",
    "            line = json.loads(line)\n",
    "            line['src'] = preprocess(line['src'], lang1=='Python')\n",
    "            line['tgt'] = preprocess(line['tgt'], lang2=='Python')\n",
    "            result.append(json.dumps(line))\n",
    "\n",
    "    return result"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "for folder in os.listdir(infolder):\n",
    "\n",
    "    try:\n",
    "        lang1 = folder.split('-')[0]\n",
    "        lang2 = folder.split('-')[1]\n",
    "    except:\n",
    "        continue\n",
    "\n",
    "    trainpath = os.path.join(infolder, folder, 'train.jsonl')\n",
    "    valpath = os.path.join(infolder, folder, 'val.jsonl')\n",
    "\n",
    "    fout = os.path.join(outfolder, folder)\n",
    "    os.makedirs(fout)\n",
    "\n",
    "    fout_train = os.path.join(fout, 'train.jsonl')\n",
    "    fout_val = os.path.join(fout, 'val.jsonl')\n",
    "\n",
    "    with open(fout_train, 'w') as f:\n",
    "        f.write('\\n'.join(process_jsonl(trainpath, lang1, lang2)))\n",
    "    with open(fout_val, 'w') as f:\n",
    "        f.write('\\n'.join(process_jsonl(valpath, lang1, lang2)))\n"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "source": [
    "print(\n",
    "    unprocess('import sys<|NL|>#+++++<|NL|><|NL|>def main():<|NL|>h, w = map(int, input().split())<|NL|>al = [list(map(int, input().split())) for i in range(h)]<|NL|><|NL|>al_i = [[0 for i in range(w)] for j in range(h)]<|NL|>al_j = [[0 for i in range(w)] for j in range(h)]<|NL|><|NL|>for i in range(h):<|NL|>al[i][0] = 1<|NL|>for j in range(w):<|NL|>al[i][j] = 1<|NL|><|NL|>for i in range(1, h')\n",
    ")"
   ],
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "import sys\n",
      "#+++++\n",
      "\n",
      "def main():\n",
      "h, w = map(int, input().split())\n",
      "al = [list(map(int, input().split())) for i in range(h)]\n",
      "\n",
      "al_i = [[0 for i in range(w)] for j in range(h)]\n",
      "al_j = [[0 for i in range(w)] for j in range(h)]\n",
      "\n",
      "for i in range(h):\n",
      "al[i][0] = 1\n",
      "for j in range(w):\n",
      "al[i][j] = 1\n",
      "\n",
      "for i in range(1, h\n"
     ]
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.8.8",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.8 64-bit ('wmd': conda)"
  },
  "interpreter": {
   "hash": "466cdeb947d5cda7f27dad06d4d5caffdbf3f921c283da2ea0bfbb043b4121b2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}