{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "76d021aa-2fa3-4f15-a261-e00744bbbc8b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import time\n",
    "import math\n",
    "import importlib\n",
    "import sys\n",
    "import torch\n",
    "from plotnine import *\n",
    "\n",
    "# Put the regLM directory first on the module search path so the import below can resolve reglm.\n",
    "# NOTE(review): the path is relative, so it depends on the working directory the kernel was started in.\n",
    "sys.path.insert(0, \"../../regLM\")\n",
    "import reglm.utils\n",
    "# Seed NumPy's legacy global random state.\n",
    "np.random.seed(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "016f8944-f311-419c-b644-8d0f1c8c45fe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "670845 4761 2456\n"
     ]
    }
   ],
   "source": [
    "# Shared reader options: keep only the first five columns and use the first one as the index.\n",
    "read_opts = dict(index_col=0, usecols=(0, 1, 2, 3, 4))\n",
    "\n",
    "# Read the three splits in train / val / test order.\n",
    "train, val, test = (\n",
    "    pd.read_csv(csv_path, **read_opts)\n",
    "    for csv_path in (\n",
    "        'regression_lm_data/train.csv',\n",
    "        'regression_lm_data/val.csv',\n",
    "        'regression_lm_data/test.csv',\n",
    "    )\n",
    ")\n",
    "\n",
    "# Row counts of the three splits.\n",
    "print(len(train), len(val), len(test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "821229a3-a895-412a-9076-ebc69560a7e2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "HepG2_mean [-0.23712702213947165, 0.05447821951007223, 0.3760034820636824, 0.9993251893856406]\n",
      "K562_mean [-0.23691146196964932, 0.03426816992833459, 0.33964092248457245, 1.0046957441353643]\n",
      "SKNSH_mean [-0.34472920630685955, 0.011347588118456035, 0.3686041198390259, 0.969303311919411]\n"
     ]
    }
   ],
   "source": [
    "# Tokenize the three training label columns into 5 bins each.\n",
    "# No percentiles are passed here; the call prints one list of four values per column (see cell output).\n",
    "train = reglm.utils.tokenize(\n",
    "    train,\n",
    "    cols=['HepG2_mean', 'K562_mean', 'SKNSH_mean'],\n",
    "    n_bins=5,\n",
    "    names=['HepG2', 'K562', 'SKNSH'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "89f2bca6-87f3-4a67-be38-264a6f4aec7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "val = reglm.utils.tokenize(\n",
    "    val, \n",
    "    cols=['HepG2_mean', 'K562_mean', 'SKNSH_mean'],\n",
    "    n_bins=5,\n",
    "    names=['HepG2', 'K562', 'SKNSH'],\n",
    "    percentiles = {\n",
    "        'HepG2_mean': [-0.23712702213947165, 0.05447821951007223, 0.3760034820636824, 0.9993251893856406],\n",
    "'K562_mean': [-0.23691146196964932, 0.03426816992833459, 0.33964092248457245, 1.0046957441353643],\n",
    "'SKNSH_mean': [-0.34472920630685955, 0.011347588118456035, 0.3686041198390259, 0.969303311919411],\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "1c30341c-b35b-4370-9558-5826dd560e37",
   "metadata": {},
   "outputs": [],
   "source": [
    "test = reglm.utils.tokenize(\n",
    "    test, \n",
    "    cols=['HepG2_mean', 'K562_mean', 'SKNSH_mean'],\n",
    "    n_bins=5,\n",
    "    names=['HepG2', 'K562', 'SKNSH'],\n",
    "    percentiles = {\n",
    "        'HepG2_mean': [-0.23712702213947165, 0.05447821951007223, 0.3760034820636824, 0.9993251893856406],\n",
    "'K562_mean': [-0.23691146196964932, 0.03426816992833459, 0.33964092248457245, 1.0046957441353643],\n",
    "'SKNSH_mean': [-0.34472920630685955, 0.011347588118456035, 0.3686041198390259, 0.969303311919411],\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "fe1351b8-4993-4cfb-8513-87c48b21d509",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame.to_csv does not create missing parent directories, so make sure the\n",
    "# output folder exists before writing (no-op when it is already there).\n",
    "os.makedirs('lm_data', exist_ok=True)\n",
    "\n",
    "# Persist the tokenized splits; the index is written as the first column.\n",
    "train.to_csv('lm_data/train.csv')\n",
    "val.to_csv('lm_data/val.csv')\n",
    "test.to_csv('lm_data/test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fea047b7-8ec8-4f20-b189-32f9bf5d1e9c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
