{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb8f257d-753a-446f-9b78-d4cf44c8bca3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sys\n",
    "import os\n",
    "import anndata\n",
    "import importlib\n",
    "\n",
    "sys.path.append('../../')\n",
    "import scripts.viz, scripts.stats, scripts.sequence, scripts.motifs\n",
    "\n",
    "from plotnine import *\n",
    "\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "542e566c-1adc-48c1-9695-fda9a821d4f6",
   "metadata": {},
   "source": [
    "## Predicted expression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ee224df-e369-446d-8e09-d2965cc30132",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the comparison table; the first column becomes the index and 'label' is read\n",
    "# as a string so that codes with leading zeros (e.g. '040', '004') are preserved.\n",
    "seqs = pd.read_csv(\n",
    "    'synthetic_enhancers/comparison.csv',\n",
    "    index_col=0,\n",
    "    dtype={'label': 'str'},\n",
    ")\n",
    "# Number of rows for each (Group, label) combination.\n",
    "seqs[['Group', 'label']].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "039d586c-dee3-429f-a79b-e7d51ab8b0b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the test-set table, keeping only the columns at positions 0, 1 and 9-12.\n",
    "# Column 0 becomes the index and 'label' is read as a string.\n",
    "real = pd.read_csv(\n",
    "    'synthetic_enhancers/test_comparison.csv',\n",
    "    index_col=0,\n",
    "    usecols=(0, 1, 9, 10, 11, 12),\n",
    "    dtype={'label': 'str'},\n",
    ")\n",
    "real.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "932973d1-4b5c-4705-ac11-fcde11281a91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mark every row of the test-set table with the group name 'Test Set'.\n",
    "real = real.assign(Group='Test Set')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e1fd921-2db7-4d75-991a-c08ae1c6389c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the test-set rows (restricted to the listed columns) below the rows already in `seqs`.\n",
    "test_cols = ['Sequence', 'label', 'Group', 'hpred', 'kpred', 'spred']\n",
    "seqs = pd.concat([seqs, real[test_cols]])\n",
    "# Give each row of the combined table a string ID equal to its position: '0', '1', ...\n",
    "seqs['SeqID'] = list(map(str, range(len(seqs))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e6d0a6a4-a59c-4525-a32e-79f6bae58540",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only rows whose label is '400', '040' or '004'; copy so the result is an independent frame.\n",
    "keep_labels = ['400', '040', '004']\n",
    "seqs = seqs.loc[seqs['label'].isin(keep_labels)].copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "279b0260-99c6-4480-b3d2-67bd888d45d9",
   "metadata": {},
   "source": [
    "## GC content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db4129a9-3298-479a-9ccd-bcc953ff51a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# GC-content plot from the project helper, using the 'label' column as the label column.\n",
    "scripts.viz.plot_gc_content(\n",
    "    seqs,\n",
    "    label_col='label',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d2ef4685-f302-4f6a-8b98-d8c98165e834",
   "metadata": {},
   "source": [
    "## k-mer content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5f0ad21-c1bb-4f4c-a9e0-0356b90a6492",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Compute 4-mer frequencies (normalize=True) for every sequence and wrap them in an AnnData\n",
    "# object whose obs table is the sequence table indexed by SeqID.\n",
    "seq_list = seqs['Sequence'].tolist()\n",
    "kmers = scripts.sequence.kmer_frequencies(seq_list, k=4, normalize=True)\n",
    "emb = anndata.AnnData(kmers)\n",
    "emb.obs = seqs.set_index('SeqID')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "999ba2c9-c4e6-49c6-a1f3-7bd94a42d446",
   "metadata": {},
   "outputs": [],
   "source": [
    "# For each label, run the differential-abundance helper on a copy of the matching subset,\n",
    "# grouping by 'Group' with 'Test Set' as the reference.\n",
    "for label in ('400', '040', '004'):\n",
    "    print(label)\n",
    "    emb_label = emb[emb.obs['label'] == label].copy()\n",
    "    scripts.stats.diff_abundance(emb_label, group_col='Group', reference='Test Set')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "770e47b6-2a80-46e3-bc50-68252e91cb19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1-nearest-neighbour summary on the k-mer frequencies (pca=False).\n",
    "# The helper is called through `scripts.sequence`, which is the module the notebook imports;\n",
    "# a bare `sequence` name is not defined on a fresh kernel.\n",
    "kmer_1nn = scripts.sequence.one_nn(emb, pca=False)\n",
    "kmer_1nn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d0219fb-bc3a-431b-ad6f-9c9b5bec36e1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the ref_dist helper to the k-mer embedding (pca=False) and keep the returned object.\n",
    "# Called through `scripts.sequence`; a bare `sequence` name is not defined on a fresh kernel.\n",
    "emb = scripts.sequence.ref_dist(emb, pca=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2fe2173e-52e2-492d-807c-9af6db424a6c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boxplot of the 'ref_dist' column of emb.obs by Group, one facet per label.\n",
    "# Called through `scripts.viz`; a bare `viz` name is not defined on a fresh kernel.\n",
    "(\n",
    "    scripts.viz.boxplot(emb.obs, value_col='ref_dist', group_col='Group')\n",
    "    + theme(figure_size=(5, 3))\n",
    "    + facet_wrap('label')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "983b3b5b-7e36-47dd-9024-13e7f3c7f82e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Differential abundance per label on the k-mer embedding, with 'Test Set' as the reference.\n",
    "# The return value is discarded; a blank line is printed after each label's output.\n",
    "for label in ('004', '040', '400'):\n",
    "    print(label)\n",
    "    emb_label = emb[emb.obs['label'] == label]\n",
    "    _ = scripts.stats.diff_abundance(emb_label, group_col='Group', reference='Test Set')\n",
    "    print(' ')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "225817b1-f856-404f-bbff-8f68da2ba7a1",
   "metadata": {},
   "source": [
    "## Motif content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "762e540d-0f7c-4fee-aba2-0888400e0b76",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the motif matrices used for scanning from an HDF file.\n",
    "# NOTE(review): the file is named 'pms.hdf' while the variable is `pwms` - confirm the file name.\n",
    "pwm_path = 'pms.hdf'\n",
    "pwms = pd.read_hdf(pwm_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7118ee99-3820-4ea4-817b-b4512a8dd2a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Scan the sequences (indexed by SeqID) with the loaded motif matrices.\n",
    "seqs_by_id = seqs.set_index('SeqID')\n",
    "sites = scripts.motifs.scan_seqs_motifs(seqs_by_id, pwms)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a1ef4fb-df44-42e2-861c-bbc611271eef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the motif scan results to disk next to the input tables.\n",
    "sites_path = 'synthetic_enhancers/comparison_sites.csv'\n",
    "sites.to_csv(sites_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28d043b1-fc9e-4f4a-a81b-d78d54c0822f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count motif sites per sequence.\n",
    "# NOTE(review): this call previously went through `reglm.motifs`, but `reglm` is never imported\n",
    "# in this notebook. It now uses the imported `scripts.motifs` module, which is assumed to\n",
    "# provide calculate_motif_counts - confirm.\n",
    "cts = scripts.motifs.calculate_motif_counts(sites)\n",
    "# Attach the sequence metadata to cts.obs by matching on the SeqID index (left join).\n",
    "seq_meta = seqs.set_index('SeqID')\n",
    "cts.obs = cts.obs.merge(seq_meta, left_index=True, right_index=True, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66f516ba-8f41-44fa-8fbe-c816ac11b1e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Differential abundance per label on the motif counts, with 'Test Set' as the reference.\n",
    "# Called through `scripts.stats`; a bare `stats` name is not defined on a fresh kernel.\n",
    "for label in ['004', '040', '400']:\n",
    "    print(label)\n",
    "    _ = scripts.stats.diff_abundance(cts[cts.obs.label == label], group_col='Group',\n",
    "                                     reference='Test Set')\n",
    "    print(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a4da630-5495-44c2-8b1c-d4606759cdfa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1-nearest-neighbour summary on the motif counts, this time with pca=True.\n",
    "motif_1nn = scripts.sequence.one_nn(\n",
    "    cts,\n",
    "    pca=True,\n",
    ")\n",
    "motif_1nn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95a4ac1e-13bb-4c4a-b02b-f3e1fb64fcc2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reshape both 1-NN tables to long format and stack them.\n",
    "# The long tables get their own names so that `kmer_1nn` / `motif_1nn` are left untouched\n",
    "# and this cell can be re-run without failing on already-melted input.\n",
    "id_cols = ['label', 'Group']\n",
    "kmer_1nn_long = kmer_1nn.reset_index().melt(id_vars=id_cols, var_name='1NN_Method')\n",
    "kmer_1nn_long['Distance'] = 'k-mer frequency'\n",
    "motif_1nn_long = motif_1nn.reset_index().melt(id_vars=id_cols, var_name='1NN_Method')\n",
    "motif_1nn_long['Distance'] = 'Motif frequency (PCA)'\n",
    "one_nn = pd.concat([kmer_1nn_long, motif_1nn_long])\n",
    "# Keep only the rows whose melted column name is 'Test Set'. The explicit copy makes\n",
    "# `one_nn` an independent frame, so adding columns to it later does not raise\n",
    "# SettingWithCopyWarning.\n",
    "one_nn = one_nn[one_nn['1NN_Method'] == 'Test Set'].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d50f05c9-f3b5-4874-86f4-e7ace952fc91",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Translate the group codes and label codes into the names shown in the figure.\n",
    "method_names = {\n",
    "    'DE': 'Evolution',\n",
    "    'LE': 'Ledidi',\n",
    "    'regLM': 'regLM',\n",
    "    'Test Set': 'Test Set',\n",
    "}\n",
    "specificity_names = {\n",
    "    '400': 'HepG2-specific',\n",
    "    '040': 'K562-specific',\n",
    "    '004': 'SKNSH-specific',\n",
    "}\n",
    "one_nn['Method'] = one_nn['Group'].map(method_names)\n",
    "one_nn['Specificity'] = one_nn['label'].map(specificity_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e9ad66f-2508-4a51-9758-57442c88acd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make 'Method' categorical with an explicit category order.\n",
    "method_order = ['Evolution', 'Ledidi', 'regLM', 'Test Set']\n",
    "one_nn['Method'] = pd.Categorical(one_nn['Method'], categories=method_order)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afd67a5f-f6a6-4ad5-bd74-917a788a09fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dodged bar chart of the 1-NN values per method, coloured by specificity and faceted by\n",
    "# distance type. The figure size is set once: the earlier (6, 3) setting was overridden by\n",
    "# the later (5, 2.4) one and has been removed.\n",
    "(\n",
    "    ggplot(one_nn, aes(y='value', x='Method', fill='Specificity'))\n",
    "    + geom_col(position='dodge')\n",
    "    + facet_wrap('Distance')\n",
    "    + theme_classic()\n",
    "    + ylab('Fraction of sequences\\nwith validated nearest\\nneighbor')\n",
    "    + theme(\n",
    "        figure_size=(5, 2.4),\n",
    "        axis_text_x=element_text(rotation=45, hjust=1),\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "451ee572-ca0f-452c-9200-ce906ce616ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the ref_dist helper to the motif counts (pca=True) and keep the returned object.\n",
    "# Called through `scripts.sequence`; a bare `sequence` name is not defined on a fresh kernel.\n",
    "cts = scripts.sequence.ref_dist(cts, pca=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c75e2817-f797-4bfc-9e4e-354bfa821a1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Boxplot of the 'ref_dist' column of cts.obs by Group, one facet per label.\n",
    "# Called through `scripts.viz`; a bare `viz` name is not defined on a fresh kernel.\n",
    "(\n",
    "    scripts.viz.boxplot(cts.obs, value_col='ref_dist', group_col='Group')\n",
    "    + theme(figure_size=(5, 3))\n",
    "    + facet_wrap('label')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30cbeb60-b314-4997-8c22-f89c56fa7734",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the within-group KNN distance helper to the motif counts (use_pca=True).\n",
    "# Called through `scripts.sequence`; a bare `sequence` name is not defined on a fresh kernel.\n",
    "cts = scripts.sequence.within_group_knn_dist(cts, use_pca=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36d594de-f017-4255-afb4-d126c066bec6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Boxplot of the 'KNN Distance' column of cts.obs by Group, one facet per label.\n",
    "(\n",
    "    scripts.viz.boxplot(cts.obs, value_col='KNN Distance', group_col='Group')\n",
    "    + theme(figure_size=(5, 3))\n",
    "    + facet_wrap('label')\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "045e5385-e567-48fe-9f70-30d38179e787",
   "metadata": {},
   "source": [
    "## Plot motif abundance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fea381c4-0be7-4be9-bd54-5e33e41bb254",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Motif identifiers to keep for the abundance plot.\n",
    "selected_motifs = [\n",
    "    'GATA1_MA0035.4', 'GATA2_MA0036.3',\n",
    "    'NFIX_MA0671.1', 'LHX5_MA1519.1',\n",
    "    'HNF1A_MA0046.2', 'HNF1B_MA0153.2', 'HNF4A_MA0114.4', 'HNF4G_MA0484.2',\n",
    "]\n",
    "# Restrict to the regLM group and the selected motifs. The explicit .copy() turns the\n",
    "# subset into an independent AnnData object, so the assignments below do not act on a view.\n",
    "cts = cts[cts.obs['Group'] == 'regLM', cts.var_names.isin(selected_motifs)].copy()\n",
    "# Keep only the part of each motif name before the first underscore,\n",
    "# e.g. 'GATA1_MA0035.4' -> 'GATA1'.\n",
    "cts.var_names = [name.split('_')[0] for name in cts.var_names]\n",
    "cts.obs['Specificity'] = cts.obs['label'].map(\n",
    "    {'400': 'HepG2-specific', '040': 'K562-specific', '004': 'SKNSH-specific'}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09b6e752-c51a-41f1-9642-fa0bbe2d77ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Motifs to plot, grouped under the names passed to the plotting helper.\n",
    "motif_groups = {\n",
    "    'Neuronal': ['NFIX', 'LHX5'],\n",
    "    'Erythropoietic': ['GATA1', 'GATA2'],\n",
    "    'Liver': ['HNF1A', 'HNF1B', 'HNF4A', 'HNF4G'],\n",
    "}\n",
    "# Motif frequency plot by 'Specificity', with a custom size, y label and label line spacing.\n",
    "(\n",
    "    scripts.viz.plot_motif_freq_by_label(cts, motifs=motif_groups, label_col='Specificity')\n",
    "    + theme(figure_size=(8, 2.1))\n",
    "    + ylab('Fraction of \\nsequences\\nwith motif')\n",
    "    + theme(axis_title_y=element_text(lineheight=1.3))\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "646b6df7-1abd-47dd-9244-69f5c69c78a5",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1bfe31ac-e833-4ee3-8005-dae1a34c91b8",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8005dc7e-ed11-4d2b-9116-1b9a3f4683d1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
