{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82c50e22-b929-4e4a-8c21-3f09b0d2a900",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from plotnine import *\n",
    "\n",
    "import sys\n",
    "sys.path.append('../../')\n",
    "import scripts.evolve, scripts.sequence, scripts.stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c60e3d5e-8228-4cee-868d-12eaaef68d8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Empty accumulator; later cells grow it with pd.concat, one matched set at a time.\n",
    "# NOTE(review): the name `all` shadows Python's builtin all() for the rest of the kernel session.\n",
    "all = pd.DataFrame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c7905e2-ec6f-4207-acc3-42af943e5e98",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the language-model predictions; `label` is read as a string so it compares against '44' as text.\n",
    "gen = pd.read_csv('synthetic_promoters/lm_filtered_pred.csv', index_col=0, dtype={'label':'str'})\n",
    "# .copy() turns the filtered rows into an independent frame, so adding `Group` below does not\n",
    "# write into a slice of the loaded frame (which triggers pandas' SettingWithCopyWarning).\n",
    "gen = gen[gen.label=='44'].copy()\n",
    "gen['Group'] = 'regLM'\n",
    "# Keep the sequence, its group tag and the two value columns.\n",
    "gen = gen[['Sequence', 'Group', 'Complex', 'Defined']]\n",
    "gen.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac534b9d-ef98-47d2-88ea-b5aea865c782",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 'DE' set: a subset of the file's columns is read, rows are tagged, and only rows with iter > 8 are kept.\n",
    "de = (\n",
    "    pd.read_csv('synthetic_promoters/DE_nn.csv', index_col=0, usecols=(0, 1, 2, 6, 7, 8))\n",
    "    .assign(Group='DE')\n",
    "    .loc[lambda frame: frame.iter > 8]\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "\n",
    "# 'LE' set: all columns are read and every row is kept.\n",
    "le = (\n",
    "    pd.read_csv('synthetic_promoters/LE_nn.csv', index_col=0)\n",
    "    .assign(Group='LE')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70311883-40e2-4b30-a79b-ab8234cc35ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): `match` is not defined or imported in any visible cell, so a fresh-kernel run will\n",
    "# likely raise NameError here -- TODO import it explicitly (presumably from a `scripts` module; confirm).\n",
    "m = match(de, gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a71294b5-f477-4e0b-b051-e7ca1354d5fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the current match result `m` to the accumulator and display the running row count.\n",
    "# NOTE(review): not idempotent -- re-running this cell appends `m` a second time.\n",
    "all = pd.concat([all, m])\n",
    "len(all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48864c13-9d62-45ee-8bee-4be6e3a82274",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrites `m` with the result for the LE set.\n",
    "# NOTE(review): `match` is not defined or imported in any visible cell -- TODO import it explicitly.\n",
    "m = match(le, gen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5fe83a8a-5dc6-43ae-8575-634466b78b77",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the current match result `m` to the accumulator and display the running row count.\n",
    "# NOTE(review): not idempotent -- re-running this cell appends `m` a second time.\n",
    "all = pd.concat([all, m])\n",
    "len(all)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6693a594-afc5-467f-b9e9-df94969f076d",
   "metadata": {},
   "source": [
    "## Save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4ee7b77-0836-4992-90d7-af09b102da29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the accumulated rows to disk; the concatenated index is discarded and replaced by a fresh\n",
    "# 0..n-1 index, which to_csv writes as the first column.\n",
    "all.reset_index(drop=True).to_csv('synthetic_promoters/comparison.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0aadb87b-c32b-4555-83b5-0e3e87654367",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Long format for plotting: every non-id column becomes rows, with the column name in `Medium`\n",
    "# and its entry in `value`. `Method` is the display name of each group, as an ordered-for-plotting\n",
    "# categorical so the legend follows the listed category order.\n",
    "to_plot = (\n",
    "    pd.concat([all, gen])\n",
    "    .melt(id_vars=['Sequence', 'Group'], var_name='Medium')\n",
    "    .assign(\n",
    "        Method=lambda long_df: pd.Categorical(\n",
    "            long_df.Group.map({'DE': 'Evolution', 'LE': 'Ledidi', 'regLM': 'regLM'}),\n",
    "            categories=['Evolution', 'Ledidi', 'regLM'],\n",
    "        )\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed894dea-68bf-4f28-9732-0eb28804d7e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare `value` between the `Method` groups using only the 'Complex' rows\n",
    "# (presumably a Kruskal-Wallis test, judging by the helper's name -- confirm in scripts.stats).\n",
    "scripts.stats.kruskal_between_groups(\n",
    "    to_plot.loc[to_plot['Medium'] == 'Complex'],\n",
    "    group_col='Method',\n",
    "    value_col='value',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31f0d46c-bec2-48bb-8c5b-859c193530e9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare `value` between the `Method` groups using only the 'Defined' rows\n",
    "# (presumably a Kruskal-Wallis test, judging by the helper's name -- confirm in scripts.stats).\n",
    "scripts.stats.kruskal_between_groups(\n",
    "    to_plot.loc[to_plot['Medium'] == 'Defined'],\n",
    "    group_col='Method',\n",
    "    value_col='value',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dccfd849-f35e-4de3-9347-97ebb1e6f933",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boxplot of `value` per medium, with one box per method.\n",
    "# NOTE(review): the two p-value labels are hard-coded text rather than computed here, so they go\n",
    "# stale if the input data change -- confirm they match the kruskal_between_groups results.\n",
    "(\n",
    "    ggplot(to_plot, aes(x='Medium', y='value', fill='Method'))\n",
    "    + geom_boxplot()\n",
    "    + theme_classic()\n",
    "    + theme(figure_size=(4, 3))\n",
    "    + ylab('Predicted Expression')\n",
    "    + annotate('text', x=1, y=20, label='p = 0.80', size=9)\n",
    "    + annotate('text', x=2, y=20, label='p = 0.47', size=9)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c948656-bcef-44d5-9ed3-f20ff3376555",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
