{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22a3d998-9189-46c7-afc6-065189ae84ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from plotnine import *\n",
    "\n",
    "import sys\n",
    "sys.path.append('../../')\n",
    "from scripts.sequence import min_edit_distance_from_reference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d395b3d-0b46-4302-9bbc-33dd2a95c904",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Empty frame that the following sections grow by concatenation.\n",
    "# NOTE(review): the name `all` shadows Python's builtin all() for the rest of the session.\n",
    "all = pd.DataFrame(data=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e8c7f0a-0f47-41ab-b1cb-84be83224c5c",
   "metadata": {},
   "source": [
    "## Match 400 (HepG2-specific)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6a595e9-e7e8-4204-8262-97205d371e40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label '400': read the DE and LE tables, tag each row with its group and label,\n",
    "# and expose the `seq` column under the name `Sequence`.\n",
    "de = (\n",
    "    pd.read_csv('synthetic_enhancers/DE_naa.csv', index_col=0)\n",
    "    .assign(Group='DE', label='400')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "\n",
    "le = (\n",
    "    pd.read_csv('synthetic_enhancers/LE_naa.csv', index_col=0)\n",
    "    .assign(Group='LE', label='400')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "\n",
    "# regLM table: `label` is read as str so it can be compared with the string '400';\n",
    "# only the rows carrying that label are kept, then appended to `all`.\n",
    "gen = (\n",
    "    pd.read_csv(\n",
    "        'synthetic_enhancers/lm_filtered.csv',\n",
    "        index_col=0,\n",
    "        usecols=(0, 1, 2, 3, 4, 5),\n",
    "        dtype={'seq': 'str', 'label': 'str', 'hpred': 'float', 'kpred': 'float', 'spred': 'float'},\n",
    "    )\n",
    "    .assign(Group='regLM')\n",
    "    .loc[lambda frame: frame.label == '400']\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "all = pd.concat([all, gen])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ccd6843-e248-42f3-9614-830a4402ba59",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call `match` on the `de` rows with iter == 10 and the regLM frame; the result is kept in `m`.\n",
    "# NOTE(review): `match` is not defined in any cell of this notebook and is not among the\n",
    "# explicit imports; unless the star-import provides it, a fresh kernel raises NameError here\n",
    "# - TODO confirm where it comes from.\n",
    "m = match(de.loc[de.iter == 10], gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee2bb78e-fd24-426c-a87f-02d58b5023c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the rows in `m` below those already collected in `all`.\n",
    "all = pd.concat([all, m], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b3af97d-b0fd-43c2-893e-327a91ba5956",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call `match` on the full `le` table and the regLM frame; the result replaces `m`.\n",
    "m = match(le, gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3991cd6-e582-4e9d-bc27-636f1a8693de",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the rows in `m` below those already collected in `all`.\n",
    "all = pd.concat([all, m], axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5dcb78c-7dea-4b7d-9e5d-a07036e61afa",
   "metadata": {},
   "source": [
    "## Match 040 (K562-specific)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a22fc5c-864b-4ac2-916d-506733e02cbc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label '040': read the DE and LE tables, tag each row with its group and label,\n",
    "# and expose the `seq` column under the name `Sequence`.\n",
    "de = (\n",
    "    pd.read_csv('synthetic_enhancers/DE_ana.csv', index_col=0)\n",
    "    .assign(Group='DE', label='040')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "\n",
    "le = (\n",
    "    pd.read_csv('synthetic_enhancers/LE_ana.csv', index_col=0)\n",
    "    .assign(Group='LE', label='040')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "\n",
    "# regLM table: `label` is read as str, so '040' keeps its leading zero and can be\n",
    "# compared as a string; only the rows carrying that label are kept, then appended to `all`.\n",
    "gen = (\n",
    "    pd.read_csv(\n",
    "        'synthetic_enhancers/lm_filtered.csv',\n",
    "        index_col=0,\n",
    "        usecols=(0, 1, 2, 3, 4, 5),\n",
    "        dtype={'seq': 'str', 'label': 'str', 'hpred': 'float', 'kpred': 'float', 'spred': 'float'},\n",
    "    )\n",
    "    .assign(Group='regLM')\n",
    "    .loc[lambda frame: frame.label == '040']\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "all = pd.concat([all, gen])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ecc8573-5e68-4961-8cd5-4ea23e43ab37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call `match` on the `de` rows with iter == 10 and the regLM frame; the result is kept in `m`.\n",
    "m = match(de.loc[de.iter == 10], gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "401b4ecc-4970-44b2-abc3-b1c67e412437",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the rows in `m` below those already collected in `all`.\n",
    "all = pd.concat([all, m], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e80701-f78a-4495-a9bb-7c35b05437fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call `match` on the full `le` table and the regLM frame; the result replaces `m`.\n",
    "m = match(le, gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f8eec0e-c6e5-4185-99be-44583b82a6ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the rows in `m` below those already collected in `all`.\n",
    "all = pd.concat([all, m], axis=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ec020d05-41a2-4391-ac2b-6a8ba7bc564d",
   "metadata": {},
   "source": [
    "## Match 004 (SKNSH-specific)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "866ddf79-63b2-4a01-aa8d-9329c89bf442",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label '004': read the DE and LE tables, tag each row with its group and label,\n",
    "# and expose the `seq` column under the name `Sequence`.\n",
    "de = (\n",
    "    pd.read_csv('synthetic_enhancers/DE_aan.csv', index_col=0)\n",
    "    .assign(Group='DE', label='004')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "\n",
    "le = (\n",
    "    pd.read_csv('synthetic_enhancers/LE_aan.csv', index_col=0)\n",
    "    .assign(Group='LE', label='004')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "\n",
    "# regLM table: `label` is read as str, so '004' keeps its leading zeros and can be\n",
    "# compared as a string; only the rows carrying that label are kept, then appended to `all`.\n",
    "gen = (\n",
    "    pd.read_csv(\n",
    "        'synthetic_enhancers/lm_filtered.csv',\n",
    "        index_col=0,\n",
    "        usecols=(0, 1, 2, 3, 4, 5),\n",
    "        dtype={'seq': 'str', 'label': 'str', 'hpred': 'float', 'kpred': 'float', 'spred': 'float'},\n",
    "    )\n",
    "    .assign(Group='regLM')\n",
    "    .loc[lambda frame: frame.label == '004']\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "all = pd.concat([all, gen])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8922f999-3e04-48c8-820b-9022bead9ca7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call `match` on the `de` rows with iter == 10 and the regLM frame; the result is kept in `m`.\n",
    "m = match(de.loc[de.iter == 10], gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7580fe87-2deb-4ec6-811e-21dc95c62e15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the rows in `m` below those already collected in `all`.\n",
    "all = pd.concat([all, m], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "811ac84c-e46b-45a4-b24c-bfe9cc966e62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Call `match` on the full `le` table and the regLM frame; the result replaces `m`.\n",
    "m = match(le, gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc353c24-a670-4b9a-b96c-b08f60fe3303",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the rows in `m` below those already collected in `all`.\n",
    "all = pd.concat([all, m], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5db92dfb-7b15-4db3-9648-20f73b827b44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the combined table to disk. The concatenated index is discarded first, so the\n",
    "# CSV's index column is a fresh 0..n-1 range.\n",
    "(\n",
    "    all\n",
    "    .reset_index(drop=True)\n",
    "    .to_csv('synthetic_enhancers/comparison.csv')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d82109ff-784f-4ab2-9001-d997ce672ce8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display names for plotting, derived from the `Group` and `label` columns.\n",
    "# A value that is not a key of the dict maps to NaN.\n",
    "all['Method'] = all['Group'].map(\n",
    "    {'DE': 'Evolution', 'LE': 'Ledidi', 'regLM': 'regLM'}\n",
    ")\n",
    "all['Specificity'] = all['label'].map(\n",
    "    {'400': 'HepG2-specific', '040': 'K562-specific', '004': 'SKNSH-specific'}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22338fce-3a27-41b2-94db-e55f8ae8d083",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the plotting columns, then go from wide to long: the three prediction columns\n",
    "# become rows, with the source column name in `Cell Line` and the number in `value`.\n",
    "all = pd.melt(\n",
    "    all.loc[:, ['Sequence', 'Method', 'Specificity', 'hpred', 'kpred', 'spred']],\n",
    "    id_vars=['Sequence', 'Method', 'Specificity'],\n",
    "    var_name='Cell Line',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb27864e-fed4-404a-be4f-df4e0d0b7602",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the prediction column names stored in `Cell Line` with the cell-line names.\n",
    "all['Cell Line'] = all['Cell Line'].map(\n",
    "    {'hpred': 'HepG2', 'kpred': 'K562', 'spred': 'SKNSH'}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7d01298-0936-4fd1-ac5f-98ab875e2044",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Box plots of `value` per cell line, filled by method, with one panel per specificity.\n",
    "(\n",
    "    ggplot(data=all, mapping=aes(x='Cell Line', y='value', fill='Method'))\n",
    "    + facet_wrap('Specificity')\n",
    "    + geom_boxplot(size=0.3, outlier_size=0.1)\n",
    "    + theme_classic()\n",
    "    + theme(figure_size=(6, 3))\n",
    "    + ylab('Predicted expression')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc255655-c41d-4910-b4f2-d3e9cc6ab580",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
