{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c1b5c85a-4e3b-444d-a7bb-70bd6308fd30",
   "metadata": {},
   "source": [
    "# Process Vaishnav et al. data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ec010db-59c0-4571-ab55-d1b81e442a60",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os\n",
    "import importlib\n",
    "import sys\n",
    "from plotnine import *\n",
    "np.random.seed(0)\n",
    "\n",
    "sys.path.append(\"../../\")\n",
    "import scripts.sequence"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cc8f2936-5ea9-48f9-8851-7b2c7ff03dd3",
   "metadata": {},
   "source": [
    "### Load all measurements in complex and defined medium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "85edf681-d332-4f22-85ad-55c7606b138d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the two headerless, tab-delimited measurement tables; the two columns of\n",
    "# each file are named 'Sequence' and 'exp'.\n",
    "# NOTE(review): the name `complex` shadows Python's builtin complex type for the\n",
    "# rest of the notebook; it is kept because the cells below refer to it.\n",
    "complex = pd.read_table(\n",
    "    '/gstore/data/resbioai/lala8/vaishnav_2022/complex_media_training_data_Glu.txt',\n",
    "    header=None,\n",
    "    names=['Sequence', 'exp'],\n",
    ")\n",
    "\n",
    "defined = pd.read_table(\n",
    "    '/gstore/data/resbioai/lala8/vaishnav_2022/defined_media_training_data_SC_Ura.txt',\n",
    "    header=None,\n",
    "    names=['Sequence', 'exp'],\n",
    ")\n",
    "\n",
    "# Preview the first rows of the defined-medium table.\n",
    "defined.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67470cb8-38d5-4867-9e43-d33afda72219",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row counts of the freshly loaded tables, shown as (complex, defined).\n",
    "complex.shape[0], defined.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e9a6e37-69ed-4fad-ab36-094ad1d9e6bf",
   "metadata": {},
   "source": [
    "### Process and filter random sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce55e8ca-7393-4652-acc1-db899fb7be26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop flanks: remove the first 17 and the last 13 characters of every sequence.\n",
    "# NOTE: the column is overwritten in place, so running this cell a second time\n",
    "# trims the already-trimmed sequences again.\n",
    "complex['Sequence'] = complex['Sequence'].apply(lambda seq: seq[17:-13])\n",
    "defined['Sequence'] = defined['Sequence'].apply(lambda seq: seq[17:-13])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7a1aaa5-0c61-4344-8ac8-ddd03b3d3590",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop Ns and filter to sequence length 80 bp.\n",
    "# Both helpers live in scripts.sequence and are not shown in this notebook:\n",
    "# drop_Ns presumably removes rows whose sequence contains N, and select_length\n",
    "# presumably keeps rows whose sequence length equals `target` -- TODO confirm.\n",
    "complex = scripts.sequence.select_length(\n",
    "    scripts.sequence.drop_Ns(complex, seq_col='Sequence'),\n",
    "    seq_col='Sequence',\n",
    "    target=80,\n",
    ")\n",
    "\n",
    "defined = scripts.sequence.select_length(\n",
    "    scripts.sequence.drop_Ns(defined, seq_col='Sequence'),\n",
    "    seq_col='Sequence',\n",
    "    target=80,\n",
    ")\n",
    "\n",
    "# Print the sizes of both tables after filtering.\n",
    "print(len(complex), len(defined))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1bed39d0-3e6a-42c2-a554-5782defaeb40",
   "metadata": {},
   "source": [
    "### Split sequences measured in both media from those measured in only one medium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d4dfbd4-c45c-44fe-974b-151f02de09bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sequences that occur in both the complex and the defined table.\n",
    "random_overlap = set(complex.Sequence) & set(defined.Sequence)\n",
    "\n",
    "# Restrict each table to the shared sequences, then join the two on Sequence;\n",
    "# columns present in both tables get the _complex / _defined suffixes.\n",
    "# NOTE(review): if a sequence occurs more than once in either table, the merge\n",
    "# emits one row per pairing -- confirm that sequences are unique per table.\n",
    "a = complex.loc[complex.Sequence.isin(random_overlap)]\n",
    "b = defined.loc[defined.Sequence.isin(random_overlap)]\n",
    "random_both = pd.merge(\n",
    "    a,\n",
    "    b,\n",
    "    on='Sequence',\n",
    "    how='inner',\n",
    "    suffixes=('_complex', '_defined'),\n",
    ")\n",
    "\n",
    "# Number of rows in the merged table.\n",
    "len(random_both)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d15ceec7-e4ff-4eb2-a48f-f84aa89a0f77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows whose sequence is not in the shared set, i.e. present in only one table.\n",
    "random_complex = complex.loc[~complex.Sequence.isin(random_overlap)]\n",
    "random_defined = defined.loc[~defined.Sequence.isin(random_overlap)]\n",
    "\n",
    "# Row counts, shown as (complex only, defined only).\n",
    "random_complex.shape[0], random_defined.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c628a78-6a5a-4f7c-9b88-a4ab080e7549",
   "metadata": {},
   "source": [
    "### Assign unique indices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a476344-c70e-4845-a062-f32a1870b0ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace each table's index with string labels of the form <prefix>_<row position>.\n",
    "# The prefix differs per table (rb_, rc_, rd_), so labels never collide across tables.\n",
    "random_both.index = [f'rb_{i}' for i in range(len(random_both))]\n",
    "random_complex.index = [f'rc_{i}' for i in range(len(random_complex))]\n",
    "random_defined.index = [f'rd_{i}' for i in range(len(random_defined))]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "990e8c67-d63d-40a1-bbd2-e54188788c19",
   "metadata": {},
   "source": [
    "### Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22827849-8645-454e-8dc6-5de497ac7d59",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make sure the output folder exists: to_csv does not create missing directories\n",
    "# and would fail on a fresh checkout. exist_ok=True keeps re-runs harmless.\n",
    "os.makedirs('processed_data', exist_ok=True)\n",
    "\n",
    "# Write the three tables to CSV. to_csv writes the index by default, so the\n",
    "# rb_/rc_/rd_ row labels are saved as the first column of each file.\n",
    "random_both.to_csv('processed_data/random_both.csv')\n",
    "random_complex.to_csv('processed_data/random_complex.csv')\n",
    "random_defined.to_csv('processed_data/random_defined.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee789e74-071b-43ec-9486-91eacc0222c6",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
