{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92d8c8ce-4e69-4901-a1ce-d659468326ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# `sys` must be imported explicitly: it is needed for the sys.path tweak below.\n",
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from plotnine import *\n",
    "import torch\n",
    "\n",
    "# Add the directory two levels up to the import path so the local `scripts` package resolves.\n",
    "sys.path.append('../../')\n",
    "import scripts.regression, scripts.evolve"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b8f9d8d-b829-4089-98c0-6700b8607b2f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restore `model` from the checkpoint saved under reg_lm/.\n",
    "lm_ckpt_path = 'reg_lm/lightning_logs/version_0/checkpoints/epoch=7-step=20968.ckpt'\n",
    "model = scripts.regression.LightningModel.load_from_checkpoint(lm_ckpt_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ad8a7bc5-be38-4781-8688-c8b39361caa2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restore `imodel` from the checkpoint saved under reg_separate/.\n",
    "separate_ckpt_path = 'reg_separate/lightning_logs/version_0/checkpoints/epoch=4-step=4635.ckpt'\n",
    "imodel = scripts.regression.LightningModel.load_from_checkpoint(separate_ckpt_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eade41d9-9cb3-49a5-988d-e4827fbed9ec",
   "metadata": {},
   "source": [
    "## Get starting sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4e15ea4e-02b6-46b4-8db7-d58c48a04017",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read regression_separate_data/test.csv, using its first column as the row index.\n",
    "test2 = pd.read_csv(\n",
    "    'regression_separate_data/test.csv',\n",
    "    index_col=0,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffc2d2fe-8226-4d73-a81c-49b5b1369f17",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only rows where HepG2_mean, K562_mean and SKNSH_mean are all below their cutoffs.\n",
    "# NOTE(review): the three cutoff values are unexplained magic numbers -- document their origin.\n",
    "below_all_cutoffs = (\n",
    "    test2['HepG2_mean'].lt(-0.23712702)\n",
    "    & test2['K562_mean'].lt(-0.23691146)\n",
    "    & test2['SKNSH_mean'].lt(-0.34472921)\n",
    ")\n",
    "test2 = test2[below_all_cutoffs]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ad392f9-7c5c-4f04-aef1-eee5b770db5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `test` is never assigned in this notebook (only `test2` is read above), so this\n",
    "# cell raises NameError on a fresh kernel. Load `test` in an earlier cell -- source file to be confirmed.\n",
    "# Pool the nt_sequence values of both frames into a single-column DataFrame (column label 0).\n",
    "pooled_seqs = test.nt_sequence.tolist() + test2.nt_sequence.tolist()\n",
    "start_seqs = pd.DataFrame(pooled_seqs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "256dbd07-829e-4891-912a-b8eb0da0b442",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the length of every sequence (column 0) in a new 'len' column.\n",
    "start_seqs['len'] = start_seqs[0].map(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9c371275-bb33-43c7-bcee-bab5247cb5d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Retain only the rows whose recorded length is exactly 200.\n",
    "has_len_200 = start_seqs['len'] == 200\n",
    "start_seqs = start_seqs[has_len_200]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb5b1264-b5aa-41ac-9d64-51db67b57387",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the helper 'len' column but keep a one-column DataFrame rather than a Series:\n",
    "# later cells select the sequences with two-axis indexing (`.iloc[:, 0]`), which a Series\n",
    "# does not support. The CSV written from this object has the same layout either way.\n",
    "start_seqs = start_seqs[[0]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80868324-1763-4b44-bd27-e39682dfe1f6",
   "metadata": {},
   "outputs": [],
   "source": [
    "start_seqs.to_csv('synthetic_enhancers/start_seqs.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "444f5d2d-1f94-4e74-a1fc-c3e70022096d",
   "metadata": {},
   "source": [
    "## Evolve cell-type-specific enhancers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0046fc1c-c9c7-47c5-b9f4-08dc7198c8f9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Run scripts.evolve.evolve once per starting sequence with to_max=[0], to_min=[1, 2]\n",
    "# (presumably the model-output indices to raise / lower -- confirm against scripts.evolve).\n",
    "# Accept start_seqs as either a Series or a one-column DataFrame.\n",
    "seq_list = pd.DataFrame(start_seqs).iloc[:, 0].tolist()\n",
    "\n",
    "# Collect the per-sequence results and concatenate once; calling pd.concat inside the\n",
    "# loop re-copies every accumulated row on each iteration.\n",
    "naa_runs = []\n",
    "for i, seq in enumerate(seq_list):\n",
    "    df = scripts.evolve.evolve([seq], model, to_max=[0], to_min=[1, 2], device=0)\n",
    "    df['start_seq'] = i  # position of the starting sequence within start_seqs\n",
    "    naa_runs.append(df)\n",
    "naa_df = pd.concat(naa_runs)\n",
    "\n",
    "naa_df[['seq', 'iter', 'start_seq']].reset_index(drop=True).to_csv('synthetic_enhancers/DE_naa.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c610e4f-0d88-41f9-a546-fb09834dbb4e",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Run scripts.evolve.evolve once per starting sequence with to_max=[1], to_min=[0, 2]\n",
    "# (presumably the model-output indices to raise / lower -- confirm against scripts.evolve).\n",
    "# Accept start_seqs as either a Series or a one-column DataFrame.\n",
    "seq_list = pd.DataFrame(start_seqs).iloc[:, 0].tolist()\n",
    "\n",
    "# Collect the per-sequence results and concatenate once; calling pd.concat inside the\n",
    "# loop re-copies every accumulated row on each iteration.\n",
    "ana_runs = []\n",
    "for i, seq in enumerate(seq_list):\n",
    "    df = scripts.evolve.evolve([seq], model, to_max=[1], to_min=[0, 2], device=0)\n",
    "    df['start_seq'] = i  # position of the starting sequence within start_seqs\n",
    "    ana_runs.append(df)\n",
    "ana_df = pd.concat(ana_runs)\n",
    "\n",
    "ana_df[['seq', 'iter', 'start_seq']].reset_index(drop=True).to_csv('synthetic_enhancers/DE_ana.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bbc0f49a-c232-48f4-8e3a-9b696d2d7ef9",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Run scripts.evolve.evolve once per starting sequence with to_max=[2], to_min=[1, 0]\n",
    "# (presumably the model-output indices to raise / lower -- confirm against scripts.evolve).\n",
    "# Accept start_seqs as either a Series or a one-column DataFrame.\n",
    "seq_list = pd.DataFrame(start_seqs).iloc[:, 0].tolist()\n",
    "\n",
    "# Collect the per-sequence results and concatenate once; calling pd.concat inside the\n",
    "# loop re-copies every accumulated row on each iteration.\n",
    "aan_runs = []\n",
    "for i, seq in enumerate(seq_list):\n",
    "    df = scripts.evolve.evolve([seq], model, to_max=[2], to_min=[1, 0], device=0)\n",
    "    df['start_seq'] = i  # position of the starting sequence within start_seqs\n",
    "    aan_runs.append(df)\n",
    "aan_df = pd.concat(aan_runs)\n",
    "\n",
    "aan_df[['seq', 'iter', 'start_seq']].reset_index(drop=True).to_csv(\n",
    "    'synthetic_enhancers/DE_aan.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6b5852b-4906-4ed0-9c39-524097dd3f08",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e6bb1dc-0a63-4bfb-b8ba-0ab2ab3920fd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
