{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfa846c8-f716-4336-953c-c5fbd5740df8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import time\n",
    "import math\n",
    "import importlib\n",
    "import sys\n",
    "import torch\n",
    "from plotnine import *\n",
    "sys.path.insert(0, \"../../regLM/\")\n",
    "\n",
    "np.random.seed(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81c8a007-3062-4ab0-b339-692372fae314",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the tab-delimited supplementary table, print its row count and preview the first rows.\n",
    "table_path = '/code/reggpt-applications/gosai_2023/SupTable 2 - UKBB_GTEX_CODA_averaged_no_cutoffs.txt'\n",
    "data = pd.read_table(table_path)\n",
    "print(len(data))\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84c6f24f-36a3-4e5d-bb91-1777f329af4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Coerce every sequence to str, then display the row positions whose sequence contains a literal caret.\n",
    "data['nt_sequence'] = data['nt_sequence'].astype(str)\n",
    "np.where(data['nt_sequence'].str.contains('^', regex=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27ad66fa-c549-4228-ba84-dafeda87c02a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the nt_sequence stored at row label 345812 with the hard-coded sequence below.\n",
    "# NOTE(review): presumably this is the row reported by the caret check in the previous cell - confirm.\n",
    "# .loc looks the row up by index label, so this must run before the index is changed by set_index('IDs').\n",
    "data.loc[345812, \"nt_sequence\"] = \"TGTAGAAAAAAATATATATATATATGAACAACGCATAATCCTGGAAATATAAGGAAAAATTAAATTTTCTCCTCTGGGAAAAATTTATACAGTAATGATTCTTGCTCTTTAATTTTTGTTTGAAAGAAATCTAGACATTTAAAAAACCCCAGTGGTAGAATTGTCTTGTTAAAAAGGGACATCAAGTAAAAGGCCAGGGG\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f3f7443-7b06-4d3b-848e-2fb577ff07f2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Index the rows by 'IDs' and keep only the sequence, the three *_mean columns and 'chr'.\n",
    "data = (\n",
    "    data\n",
    "    .set_index('IDs')\n",
    "    .loc[:, ['nt_sequence', 'HepG2_mean', 'K562_mean', 'SKNSH_mean', 'chr']]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2835de42-a0f1-4b67-8b7c-a74c8def7e63",
   "metadata": {},
   "source": [
    "## Split some data for the independent regression model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe9b075b-eaa0-44e1-a54f-645648a683f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw 120000 random rows into `reg`; every row whose index label was not drawn goes to `lm`.\n",
    "# An explicit random_state makes this cell idempotent: without it the draw comes from numpy's\n",
    "# global generator, so re-running the cell would pick a different subset and silently change\n",
    "# both `reg` and `lm`. The seed matches the np.random.seed(0) set in the first cell.\n",
    "reg = data.sample(120000, random_state=0)\n",
    "lm = data[~data.index.isin(reg.index)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1ce2d1fd-f948-4fe9-a940-478802b053be",
   "metadata": {},
   "source": [
    "## Train/val/test splits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee643e64-1ff6-4215-9c5f-7adb1cae3194",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split `reg` by the 'chr' column: 7 -> validation, 13 -> test, every other row -> train.\n",
    "reg_val = reg.loc[reg['chr'] == 7]\n",
    "reg_test = reg.loc[reg['chr'] == 13]\n",
    "reg_train = reg.loc[~reg['chr'].isin([7, 13])]\n",
    "print(len(reg_train), len(reg_val), len(reg_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60d4174e-33f6-4c35-83a6-1ba61a89fa62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split `lm` by the 'chr' column: 7 -> validation, 13 -> test, every other row -> train.\n",
    "lm_val = lm.loc[lm['chr'] == 7]\n",
    "lm_test = lm.loc[lm['chr'] == 13]\n",
    "lm_train = lm.loc[~lm['chr'].isin([7, 13])]\n",
    "print(len(lm_train), len(lm_val), len(lm_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f2f96f44-7481-40ac-8fd0-6476fa339a91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the three `reg` splits as CSV files (the index is written as the first column).\n",
    "# Create the output directory first: DataFrame.to_csv does not create missing parent\n",
    "# directories, so on a fresh checkout the writes would otherwise fail.\n",
    "os.makedirs('regression_separate_data', exist_ok=True)\n",
    "reg_train.to_csv('regression_separate_data/train.csv')\n",
    "reg_val.to_csv('regression_separate_data/val.csv')\n",
    "reg_test.to_csv('regression_separate_data/test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06c3029d-804f-41be-a307-89de4431b818",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the three `lm` splits as CSV files (the index is written as the first column).\n",
    "# Create the output directory first: DataFrame.to_csv does not create missing parent\n",
    "# directories, so on a fresh checkout the writes would otherwise fail.\n",
    "os.makedirs('regression_lm_data', exist_ok=True)\n",
    "lm_train.to_csv('regression_lm_data/train.csv')\n",
    "lm_val.to_csv('regression_lm_data/val.csv')\n",
    "lm_test.to_csv('regression_lm_data/test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ac60732-6e7d-41fb-8c51-e03f279a879b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
