{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "35c04660-9c1d-4cd0-9b78-feb5c824e347",
   "metadata": {},
   "source": [
    "# Scan the test set promoters to identify activating and repressing TF motifs for further analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41ba0a70-b824-49b9-9660-958fb937488b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sys\n",
    "sys.path.append('../../')\n",
    "import scripts.motifs, scripts.stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac8e5856-a35f-4772-b9c9-48abe73ee2df",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read columns 0, 1 and 6 of the test CSV (column 0 becomes the index), all as\n",
    "# strings; labels are later compared against string values such as '00'.\n",
    "test = (\n",
    "    pd.read_csv('lm_data/test.csv', index_col=0, usecols=(0, 1, 6), dtype='str')\n",
    "    .rename(columns={'seq': 'Sequence'})\n",
    ")\n",
    "# Give every row a sequential identifier: 'seq0', 'seq1', ...\n",
    "test['SeqID'] = [f'seq{i}' for i in range(len(test))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "562d5e77-b900-433a-81e3-7c7081c01fea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict to rows whose label is one of these five string values, then\n",
    "# display how many rows each label has.\n",
    "test = test.loc[test['label'].isin(['00', '11', '22', '33', '44'])]\n",
    "test['label'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "171aa6ce-9d98-4ce1-8e48-a2eec0b9fc88",
   "metadata": {},
   "source": [
    "## Scan with motifs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19cf7738-a634-49b1-b0be-010c951fd6d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the motif matrices passed to the scan below (presumably PWMs from\n",
    "# YeTFaSCo v1.02, judging by the path -- confirm).\n",
    "# NOTE(review): absolute path -- only resolves where /gstore is mounted.\n",
    "pwms = pd.read_hdf(\n",
    "    path_or_buf='/gstore/data/resbioai/lala8/yetfasco_1.02/pms.hdf',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb82ddbf-d18c-420c-a872-f262343144a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Scan the filtered test sequences with the loaded motifs using 8 workers,\n",
    "# then pass the resulting sites to calculate_motif_counts.\n",
    "sites = scripts.motifs.scan_seqs_motifs(test, pwms, num_workers=8)\n",
    "cts = scripts.motifs.calculate_motif_counts(sites)\n",
    "# Left-join the test table onto cts.obs by index so that cts.obs carries the\n",
    "# columns of test (e.g. label, which later cells read from cts.obs).\n",
    "# NOTE(review): assumes cts.obs and test share index values -- confirm.\n",
    "cts.obs = pd.merge(\n",
    "    cts.obs,\n",
    "    test,\n",
    "    how='left',\n",
    "    left_index=True,\n",
    "    right_index=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e88e8ba3-f7c4-48b3-b172-ba6707005665",
   "metadata": {},
   "source": [
    "## Test for differential abundance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b314d34-433d-4fd9-95dc-7929bcb71aea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the differential-abundance helper on the counts, grouping by the\n",
    "# 'label' column with '00' as the reference group.\n",
    "# (See scripts.stats.diff_abundance for the statistical test that is applied.)\n",
    "diff_motif_df = scripts.stats.diff_abundance(\n",
    "    cts,\n",
    "    group_col='label',\n",
    "    reference='00',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43e930a2-130f-430a-a61f-d190f5f278e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First 20 result rows for label '44'.\n",
    "# NOTE(review): presumably the frame is sorted by effect so these are one\n",
    "# extreme -- confirm in scripts.stats.diff_abundance.\n",
    "diff_motif_df.loc[diff_motif_df['label'] == '44'].head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1f77f36-3ade-4a57-a70a-84e0c579f7a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Last 20 result rows for label '44'.\n",
    "# NOTE(review): presumably the frame is sorted by effect so these are the\n",
    "# opposite extreme -- confirm in scripts.stats.diff_abundance.\n",
    "diff_motif_df.loc[diff_motif_df['label'] == '44'].tail(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ed1c1a6-1692-4c74-93e8-0b5c7465e45b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the scan results next to the input data: the sites as CSV and the\n",
    "# count object (whose obs already includes the merged test columns) as .h5ad.\n",
    "sites.to_csv('lm_data/test_sites.csv')\n",
    "cts.write_h5ad('lm_data/test_site_cts.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4694b27b-dff6-4d65-8fb8-d45c8f6b2323",
   "metadata": {},
   "source": [
    "## Number of motifs vs. label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf9c72c2-6ec8-4a38-a6b8-b4a4396da741",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build a long table with one row per sequence: its label and the row sum of\n",
    "# the count matrix (total motif count for that sequence).\n",
    "# The row sums are flattened to a 1-D array because a scipy sparse X returns a\n",
    "# 2-D np.matrix from .sum(1), which pandas rejects as a column; for a dense X\n",
    "# the values are unchanged.\n",
    "n_sites = pd.concat([\n",
    "    pd.DataFrame({\n",
    "        'label': x,\n",
    "        'n': np.asarray(cts[cts.obs.label == x].X.sum(1)).ravel(),\n",
    "    })\n",
    "    for x in ['00', '11', '22', '33', '44']\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13ae456e-ce2b-41d7-92d3-4004b46ff079",
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.stats import mannwhitneyu\n",
    "# One-sided Mann-Whitney U test: are the per-sequence motif totals for label\n",
    "# '44' greater than those for label '33'?\n",
    "mannwhitneyu(\n",
    "    n_sites.loc[n_sites['label'] == '44', 'n'].tolist(),\n",
    "    n_sites.loc[n_sites['label'] == '33', 'n'].tolist(),\n",
    "    alternative='greater',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2538e4e-0865-4c26-8878-e3a34e19077f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The plotting names below were never imported in this notebook, so the cell\n",
    "# failed with NameError on a fresh kernel; import them explicitly.\n",
    "from plotnine import (\n",
    "    aes, annotate, geom_boxplot, ggplot, theme_classic, xlab, ylab,\n",
    ")\n",
    "\n",
    "# Box plot of motifs per sequence for each label.\n",
    "# NOTE(review): the p-value in the annotation is a hard-coded string, not taken\n",
    "# from the mannwhitneyu result above -- re-check it whenever the data change.\n",
    "(\n",
    "    ggplot(n_sites, aes(x='label', y='n'))\n",
    "    + geom_boxplot(outlier_size=.1)\n",
    "    + ylab(\"Number of motifs per sequence\")\n",
    "    + xlab(\"Label\")\n",
    "    + theme_classic()\n",
    "    + annotate('text', x=2, y=170, label='p(44 vs. 33) = 1e-145', size=9)\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
