{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b7e5f72-91e6-43c8-9422-22852158738d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sys\n",
    "import os\n",
    "import anndata\n",
    "import scanpy as sc\n",
    "import importlib\n",
    "\n",
    "sys.path.append('../../')\n",
    "import scripts.sequence, scripts.viz, scripts.stats, scripts.motifs\n",
    "\n",
    "from plotnine import *\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0fa7391f-5310-4f37-9f71-a27ff096760a",
   "metadata": {},
   "source": [
    "## Read sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd1f805c-67f8-40c7-85a5-9c04dc4025da",
   "metadata": {},
   "outputs": [],
   "source": [
    "gen = pd.read_csv('synthetic_promoters/lm_filtered_pred.csv', index_col=0, usecols=(0,1,2), dtype={'label':'str'})\n",
    "gen['Group'] = 'regLM'\n",
    "gen.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4e248d0-ee92-436d-8f4b-4797264d33de",
   "metadata": {},
   "outputs": [],
   "source": [
    "seqs = pd.read_csv('synthetic_promoters/comparison.csv', index_col=0, usecols=(0,1,2))\n",
    "seqs.Group = seqs.Group.map({'DE':'Evolution', 'LE':'Ledidi'})\n",
    "seqs['label'] = '44'\n",
    "seqs.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0b2c840-e079-464c-be4f-4c53bbe531a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "real = pd.read_csv('lm_data/test.csv', index_col=0, usecols=(0,1,6), dtype={'label':'str'})\n",
    "real['Group'] = 'Test Set'\n",
    "real = real.rename(columns={'seq':'Sequence'})\n",
    "real.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9eeb8b9-95ed-4fce-a07c-9bb5846d84b4",
   "metadata": {},
   "source": [
    "## Combine all strong promoters for comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5155f4a8-6ed3-4f87-92c6-fa3b8575b11b",
   "metadata": {},
   "outputs": [],
   "source": [
    "all = pd.concat([real, gen, seqs])\n",
    "all = all[['Sequence', 'Group', 'label']]\n",
    "all['SeqID'] = [str(x) for x in range(len(all))]\n",
    "all.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98007319-2149-44a7-bffd-34cf2227b70f",
   "metadata": {},
   "outputs": [],
   "source": [
    "all['Group'] = pd.Categorical(all.Group, categories=['Evolution', 'Ledidi', 'regLM', 'Test Set'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1792641-4faa-4d7c-b97c-bdaf2cb823ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "all = all[all.label=='44']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c5a57d8d-9e30-4ad8-87f4-c6d952ae1078",
   "metadata": {},
   "source": [
    "## GC content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "65e62958-5bf2-4438-91fc-d446e07bc3f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.viz.plot_gc_content(all)+\\\n",
    "annotate('text', x=1, y = .75, label='*', size=12)+\\\n",
    "annotate('text', x=2, y = .75, label='*', size=12)+\\\n",
    "theme(figure_size=(1.2,2))+\\\n",
    "theme(axis_text_x=element_text(rotation=70, hjust=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "880c0581-bc65-471f-9777-ba9b112caaf2",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.kruskal_between_groups(all, value_col='GC Content')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "927a8b39-1dc9-4191-8a96-c63b6e94ac59",
   "metadata": {},
   "source": [
    "## k-mer content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93030380-c305-4068-a908-9eafc316b9e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "kmers = scripts.sequence.kmer_frequencies(all.Sequence.tolist(), k=4, normalize=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3249338b-6858-4ec7-ac31-4db7b933ba91",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmers.obs = all.reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "488e5628-dd5c-4a00-a198-913bf5ef591c",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmer_diff = scripts.stats.diff_abundance(kmers, group_col='Group', reference='Test Set')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "71c7077c-160b-456a-9daf-5c1d137dc82e",
   "metadata": {},
   "source": [
    "## Distance metrics in k-mer space"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "126d40aa-5d7d-4af8-a476-cc5b4e86cd76",
   "metadata": {},
   "source": [
    "### 1-NN proportion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6ffc40a-87c8-4dc9-8d87-4b467f188631",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = scripts.sequence.one_nn(kmers, pca=False)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4400ddc-feb6-4425-9351-e6b7dd536680",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.proportion_between_groups(kmers.obs, value_col='one_nn_group', ref_value='Test Set')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f8fd3d0-8b9d-4f8e-9610-d1a1e6bddccb",
   "metadata": {},
   "source": [
    "### Distance to closest reference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "741ef934-e249-40ad-bb94-6b56202e163b",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmers = scripts.sequence.ref_dist(kmers, pca=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "595635a5-e457-49e6-940f-a6cfbf7261ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.viz.boxplot(kmers.obs, value_col='ref_dist')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4d01ea6-e2e5-4c3b-8d6e-3de4d1f8874f",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.kruskal_between_groups(kmers.obs, value_col='ref_dist')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21ef7b11-fb93-4dfb-a17f-8339d97a796c",
   "metadata": {},
   "source": [
    "### KNN distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3ae1712-4ff6-4b23-816b-6a37c34835b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmers_sample = anndata.concat([kmers[kmers.obs.Group!='Test Set'], \n",
    "                               kmers[kmers.obs[kmers.obs.Group=='Test Set'].sample(100).index, :]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "701bbb83-f90d-4096-ab61-72ec99679a3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "kmers_sample = scripts.sequence.within_group_knn_dist(kmers_sample, use_pca=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68c1baa6-4b62-49b6-8cd1-77f6b809a8be",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.viz.boxplot(kmers_sample.obs, value_col='KNN Distance')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d1ef107-0879-41fb-9077-460082cbfbd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.kruskal_between_groups(kmers_sample.obs, value_col='KNN Distance')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d9e786a-7c44-432d-a370-8ace461c4b6a",
   "metadata": {},
   "source": [
    "## Motifs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f22397a4-395a-44af-8124-387ecef2eb20",
   "metadata": {},
   "outputs": [],
   "source": [
    "pwms = pd.read_hdf('/gstore/data/resbioai/lala8/yetfasco_1.02/pms.hdf')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2708956-ae8b-47ce-be76-796fa102ed51",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "sites = scripts.motifs.scan_seqs_motifs(all.set_index('SeqID'), pwms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b4a4678-8cd0-4a6a-81c7-a0394d12d285",
   "metadata": {},
   "outputs": [],
   "source": [
    "cts = scripts.motifs.calculate_motif_counts(sites)\n",
    "cts.obs = cts.obs.merge(all.set_index('SeqID'), left_index=True, right_index=True, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c261814a-1799-4de9-b1a0-9bcdb9c9a9bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "sites.to_csv('synthetic_promoters/comparison_sites.csv')\n",
    "cts.write_h5ad('synthetic_promoters/comparison_site_cts.h5ad')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9138695-1654-4b4e-9566-f182a1eb8656",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.diff_abundance(cts, 'Group', 'Test Set')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad8c5166-8f08-4a1b-9955-d9020792d760",
   "metadata": {},
   "source": [
    "## Distance metrics in motif space"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8ae501b6-9a14-4cb6-bbdc-e3167cdee5d7",
   "metadata": {},
   "source": [
    "### 1-NN proportion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d200a48-109e-4c19-806a-fe95f0ee4d8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = scripts.sequence.one_nn(cts, pca=True)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5a8c39d-282b-40c2-9f30-4803c25db1ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.proportion_between_groups(cts.obs, value_col='one_nn_group', ref_value='Test Set')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "51658e51-68c6-4d4e-9ffc-03a3daf2c5dc",
   "metadata": {},
   "source": [
    "### Distance to closest reference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60afc4c6-49d6-4ac1-b26d-6f94d6bd1c54",
   "metadata": {},
   "outputs": [],
   "source": [
    "cts = scripts.sequence.ref_dist(cts, pca=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "705f5145-5f00-406e-bd15-c21eaff499d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.viz.boxplot(cts.obs, value_col='ref_dist')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89678251-7c83-4d64-8c53-b0ba165b9bc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.kruskal_between_groups(cts.obs, value_col='ref_dist')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e3e6bbf-1bca-4ff9-8c82-867c3635845e",
   "metadata": {},
   "source": [
    "### KNN Distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5482f631-7eea-4f3a-9f37-9765394a03a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "cts_sample = anndata.concat([cts[cts.obs.Group!='Test Set'], cts[cts.obs[cts.obs.Group=='Test Set'].sample(100).index, :]])\n",
    "cts_sample.obs.Group = pd.Categorical(cts_sample.obs.Group, categories=['Evolution', 'Ledidi', 'regLM', 'Test Set'])\n",
    "cts_sample = scripts.sequence.within_group_knn_dist(cts_sample, use_pca=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1bd1444-9f0d-4a17-8a86-a88f7dfec51b",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.viz.boxplot(cts_sample.obs, value_col='KNN Distance') +\\\n",
    "annotate('text', x=1, y = 8.5, label='*', size=12) +\\\n",
    "annotate('text', x=2, y = 8.5, label='*', size=12)+\\\n",
    "theme(figure_size=(1.2,2))+\\\n",
    "theme(axis_text_x=element_text(rotation=60, hjust=1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f74488b6-9707-4a9e-8400-4608377a8892",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.kruskal_between_groups(cts_sample.obs, value_col='KNN Distance')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ce22150-fd1d-466e-9eab-ca2a463f560a",
   "metadata": {},
   "source": [
    "## Motif clustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66fa5cd5-cc01-4ba8-b324-36c398dfb42d",
   "metadata": {},
   "outputs": [],
   "source": [
    "sc.pp.neighbors(cts)\n",
    "sc.tl.umap(cts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce154f55-c891-4a47-8d31-77104b771cd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "importlib.reload(scripts.viz)\n",
    "p1, p2, p3 = scripts.viz.cluster_dist(cts, resolution=.7, group_col='Group', ref_group='Test Set')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "368032e5-838a-480a-8bf7-b68803e20021",
   "metadata": {},
   "outputs": [],
   "source": [
    "p1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1449c6bd-c066-47cd-abfe-fde636ecef51",
   "metadata": {},
   "outputs": [],
   "source": [
    "p2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61e19297-80b7-48cf-b8da-ddc1ea1ecb86",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.proportion_between_groups(cts.obs, value_col='leiden', ref_value=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f235f48-f86b-4421-8d1e-829dbd3b566a",
   "metadata": {},
   "source": [
    "## Motif abundance in strong vs. weak"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0bb44765-8afc-4c5f-a060-245953c75941",
   "metadata": {},
   "outputs": [],
   "source": [
    "act = ['SPT15_2172_0', 'PUT3_2223_0', 'GAL4_2126_0', 'HAA1_1425_0', 'PDR3_1387_0', 'NDT80_2145_0', 'MBP1_500_0', 'RSC3_2165_0', 'ADR1_623_0', 'MSN2_1381_0']\n",
    "rep = [\"ASH1_28_0\", \"MOT3_193_0\", \"DOT6_557_0\",  \"MATALPHA2_2212_0\", \"DAL80_636_0\", \"ROX1_537_0\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ff47799-bc1f-4eb3-867c-bcebf442d0ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "sites.index = sites.index.astype(str)\n",
    "sites = sites[(sites.index.isin(cts.obs.index)) & (sites.Matrix_id.isin(act))]\n",
    "sites.pos = sites.pos.astype(int)\n",
    "sites = sites.merge(cts.obs[['Group']], left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73cb5b12-449e-4f81-b955-46f95d68d64a",
   "metadata": {},
   "outputs": [],
   "source": [
    "for m in act:\n",
    "    print(m, [len(sites.loc[(sites.Group==g) & (sites.Matrix_id==m)]) for g in sites.Group.unique()])\n",
    "    df=scripts.stats.kruskal_between_groups(sites[sites.Matrix_id==m], value_col='pos', dunn=False)\n",
    "    print(\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1175c0b5-fb4a-40b4-ae31-bbadeaceb591",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.kruskal_between_groups(sites[sites.Matrix_id=='NDT80_2145_0'], value_col='pos')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c94dde96-9fb4-4a12-8f7a-34558ddde201",
   "metadata": {},
   "outputs": [],
   "source": [
    "scripts.stats.kruskal_between_groups(sites[sites.Matrix_id=='RSC3_2165_0'], value_col='pos')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d32daf9c-a7c7-4d53-863c-b860ced12374",
   "metadata": {},
   "outputs": [],
   "source": [
    "sites['Method'] = sites.Group.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3da3ac32-fa56-4e0b-9c6e-4df59d24756e",
   "metadata": {},
   "outputs": [],
   "source": [
    "to_plot = sites[sites.Matrix_id.isin(['RSC3_2165_0', 'NDT80_2145_0'])]\n",
    "to_plot['matrix'] = [x.split('_')[0] for x in to_plot.Matrix_id]\n",
    "(\n",
    "  ggplot(to_plot, aes(x='pos', color='Method')) \n",
    "    + geom_density() \n",
    "    + facet_wrap('matrix', ncol=2)\n",
    "    + xlab(\"Position\")\n",
    "    + theme_classic()\n",
    "    + theme(figure_size=(3.8,2))\n",
    "    + guides(color = guide_legend(override_aes = {'size':.8}))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0cdf204-b491-41b4-b5c4-ba85d4bbaece",
   "metadata": {},
   "outputs": [],
   "source": [
    "(\n",
    "    ggplot(sites[sites.Matrix_id.isin(act)], \n",
    "           aes(x='pos', color='Group')) \n",
    "    + geom_density() \n",
    "    + facet_wrap('Matrix_id', ncol=4, scales='free_y')\n",
    "    + xlab(\"Start position\")\n",
    "    + theme_classic()\n",
    "    + theme(figure_size=(9,6))\n",
    "    + guides(color = guide_legend(override_aes = {'size':1}))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "54e39032-2da2-4eba-aedb-845cec67e4f1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
