{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "55a49d4b-2205-4090-b092-10e08a75b53f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import random\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f54eb3d0-159c-4e11-804a-98294d0d8f7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('../../data/code-translation/java-C#/data/train.java-cs.txt.java', 'r') as f:\n",
    "    javacodes = f.readlines()\n",
    "\n",
    "with open('../../data/code-translation/java-C#/data/train.java-cs.txt.cs', 'r') as f:\n",
    "    cscodes = f.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "713046f6-bd54-4e53-940d-2efc32839a91",
   "metadata": {},
   "outputs": [],
   "source": [
    "rseeds = list(range(10))\n",
    "noise_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]\n",
    "n = len(javacodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "bd0d5018-6b0c-412f-a03e-b3b81c67a4be",
   "metadata": {},
   "outputs": [],
   "source": [
    "noisy_data = {}\n",
    "\n",
    "for seed in rseeds:\n",
    "    \n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    noisy_data[seed] = {}\n",
    "    \n",
    "    for noise in noise_levels:\n",
    "        \n",
    "        noisy_data[seed][noise] = {'java': [], 'csharp': []}\n",
    "        \n",
    "        n_noisy = int(noise * n)\n",
    "        n_correct = n - n_noisy\n",
    "\n",
    "        perm = np.random.permutation(n)\n",
    "        noisy_idxs = perm[:n_noisy]\n",
    "        correct_idxs = perm[n_noisy:]\n",
    "\n",
    "        # Add correct samples\n",
    "        for idx in correct_idxs:\n",
    "            javasample = javacodes[idx]\n",
    "            csample = cscodes[idx]\n",
    "            noisy_data[seed][noise]['java'].append(javasample)\n",
    "            noisy_data[seed][noise]['csharp'].append(csample)\n",
    "\n",
    "\n",
    "        # Add noisy samples\n",
    "        if n_noisy > 0:\n",
    "            noisy_idxs_shifted = np.array(list(noisy_idxs[1:]) + list(noisy_idxs[0:1]))\n",
    "\n",
    "            assert len(noisy_idxs) == len(noisy_idxs_shifted)\n",
    "            assert True not in [x==y for x,y in zip(noisy_idxs, noisy_idxs_shifted)]\n",
    "\n",
    "            for i, idx in enumerate(noisy_idxs):\n",
    "                javasample = javacodes[idx]\n",
    "                idx_noisy = noisy_idxs_shifted[i]\n",
    "                csample = cscodes[idx_noisy]\n",
    "                noisy_data[seed][noise]['java'].append(javasample)\n",
    "                noisy_data[seed][noise]['csharp'].append(csample)\n",
    "\n",
    "        assert len(noisy_data[seed][noise]['java']) == n\n",
    "        assert len(noisy_data[seed][noise]['csharp']) == n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bf42531a-3af8-4f18-9c64-8c14e5519cb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "outdir = f\"./random-data/\"\n",
    "\n",
    "for seed, seeddata in noisy_data.items():\n",
    "    for noise, data in seeddata.items():\n",
    "        dirpath = os.path.join(outdir, f'seed-{seed}', f'noise-{noise}')\n",
    "        os.makedirs(dirpath)\n",
    "\n",
    "        java_fname = os.path.join(dirpath, f'noise-{noise}-train.java-cs.txt.java')\n",
    "        csharp_fname = os.path.join(dirpath, f'noise-{noise}-train.java-cs.txt.cs')\n",
    "\n",
    "        with open(java_fname, 'w') as f:\n",
    "            f.writelines(data['java'])\n",
    "        with open(csharp_fname, 'w') as f:\n",
    "            f.writelines(data['csharp'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0750d6d-e399-4726-852c-29c3de015609",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3571ee18-ca68-451c-8075-cdc782038891",
   "metadata": {},
   "outputs": [],
   "source": [
    "outdir = \"./data\"\n",
    "\n",
    "for noise, data in noisy_data.items():\n",
    "    dirpath = os.path.join(outdir, f'noise-{noise}')\n",
    "    os.makedirs(dirpath)\n",
    "    \n",
    "    java_fname = os.path.join(dirpath, f'noise-{noise}-train.java-cs.txt.java')\n",
    "    csharp_fname = os.path.join(dirpath, f'noise-{noise}-train.java-cs.txt.cs')\n",
    "    \n",
    "    with open(java_fname, 'w') as f:\n",
    "        f.writelines(data['java'])\n",
    "    with open(csharp_fname, 'w') as f:\n",
    "        f.writelines(data['csharp'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e6d67484-c0ae-4a8c-bfa9-85652f4c93be",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([], dtype=int64)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "noisy_idxs_shifted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "82c565ea-2e87-49f8-b2e6-e354015777c4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10300"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "068d864f-f81d-4213-8e70-72c0de6dfa07",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
