{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys, os\n",
    "module_path = os.path.abspath(os.path.join('..'))\n",
    "if module_path not in sys.path:\n",
    "    sys.path.append(module_path)\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scanpy as sc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We load the single-cell data using scanpy. The single-cell data is stored in a special data structure called AnnData (short: adata)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/anaconda3/envs/vci-env/lib/python3.9/site-packages/anndata/_core/anndata.py:1785: FutureWarning: X.dtype being converted to np.float32 from float64. In the next version of anndata (0.9) conversion will not be automatic. Pass dtype explicitly to avoid this warning. Pass `AnnData(X, dtype=X.dtype, ...)` to get the future behavour.\n",
      "  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "AnnData object with n_obs × n_vars = 581777 × 58347\n",
       "    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch'\n",
       "    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1'"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the five raw chunk files and merge them into a single AnnData object.\n",
    "adatas = [sc.read(f'sciplex_raw_chunk_{i}.h5ad') for i in range(5)]\n",
    "first_chunk, *other_chunks = adatas\n",
    "adata = first_chunk.concatenate(other_chunks)\n",
    "adata"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The counts are stored in adata.X\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<581777x58347 sparse matrix of type '<class 'numpy.float32'>'\n",
       "\twith 761621411 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.X"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`adata.obs` is a dataframe containing annotations for each cell, e.g. batch, cell type, or perturbation, as well as other technical annotations at the cell level."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cell_type</th>\n",
       "      <th>dose</th>\n",
       "      <th>dose_character</th>\n",
       "      <th>dose_pattern</th>\n",
       "      <th>g1s_score</th>\n",
       "      <th>g2m_score</th>\n",
       "      <th>pathway</th>\n",
       "      <th>pathway_level_1</th>\n",
       "      <th>pathway_level_2</th>\n",
       "      <th>product_dose</th>\n",
       "      <th>product_name</th>\n",
       "      <th>proliferation_index</th>\n",
       "      <th>replicate</th>\n",
       "      <th>size_factor</th>\n",
       "      <th>target</th>\n",
       "      <th>vehicle</th>\n",
       "      <th>batch</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A01_E09_RT_BC_100_Lig_BC_245-0-0-0</th>\n",
       "      <td>A549</td>\n",
       "      <td>1000.0</td>\n",
       "      <td>1000</td>\n",
       "      <td>2</td>\n",
       "      <td>1.155964</td>\n",
       "      <td>2.475312</td>\n",
       "      <td>TGF-beta/Smad</td>\n",
       "      <td>PKC signaling</td>\n",
       "      <td>PKC activitiy</td>\n",
       "      <td>Enzastaurin (LY317615)_1000</td>\n",
       "      <td>Enzastaurin (LY317615)</td>\n",
       "      <td>2.643512</td>\n",
       "      <td>rep2</td>\n",
       "      <td>2.296651</td>\n",
       "      <td>PKC</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A01_E09_RT_BC_100_Lig_BC_306-0-0-0</th>\n",
       "      <td>A549</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.980748</td>\n",
       "      <td>DNA Damage</td>\n",
       "      <td>DNA damage &amp; DNA repair</td>\n",
       "      <td>Nucleotide analog</td>\n",
       "      <td>Raltitrexed_10</td>\n",
       "      <td>Raltitrexed</td>\n",
       "      <td>1.980748</td>\n",
       "      <td>rep2</td>\n",
       "      <td>0.480141</td>\n",
       "      <td>DNA/RNA Synthesis</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A01_E09_RT_BC_101_Lig_BC_109-0-0-0</th>\n",
       "      <td>A549</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>Vehicle</td>\n",
       "      <td>Vehicle</td>\n",
       "      <td>Vehicle</td>\n",
       "      <td>Vehicle_0</td>\n",
       "      <td>Vehicle</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>rep1</td>\n",
       "      <td>0.516561</td>\n",
       "      <td>Vehicle</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A01_E09_RT_BC_101_Lig_BC_229-0-0-0</th>\n",
       "      <td>A549</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>1.817254</td>\n",
       "      <td>2.801225</td>\n",
       "      <td>Apoptosis</td>\n",
       "      <td>Protein folding &amp; Protein degradation</td>\n",
       "      <td>E3 ubiquitin ligase activity</td>\n",
       "      <td>Lenalidomide (CC-5013)_10</td>\n",
       "      <td>Lenalidomide (CC-5013)</td>\n",
       "      <td>3.073606</td>\n",
       "      <td>rep2</td>\n",
       "      <td>0.387978</td>\n",
       "      <td>TNF-alpha</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A01_E09_RT_BC_101_Lig_BC_280-0-0-0</th>\n",
       "      <td>A549</td>\n",
       "      <td>1000.0</td>\n",
       "      <td>1000</td>\n",
       "      <td>2</td>\n",
       "      <td>1.637016</td>\n",
       "      <td>0.867074</td>\n",
       "      <td>Ubiquitin</td>\n",
       "      <td>Epigenetic regulation</td>\n",
       "      <td>Histone deacetylation</td>\n",
       "      <td>Divalproex Sodium_1000</td>\n",
       "      <td>Divalproex Sodium</td>\n",
       "      <td>1.874835</td>\n",
       "      <td>rep2</td>\n",
       "      <td>0.724671</td>\n",
       "      <td>HDAC</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>H12_F10_RT_BC_99_Lig_BC_172-1-4</th>\n",
       "      <td>K562</td>\n",
       "      <td>1000.0</td>\n",
       "      <td>1000</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.410709</td>\n",
       "      <td>Angiogenesis</td>\n",
       "      <td>Tyrosine kinase signaling</td>\n",
       "      <td>RTK activity</td>\n",
       "      <td>PD173074_1000</td>\n",
       "      <td>PD173074</td>\n",
       "      <td>1.410709</td>\n",
       "      <td>rep1</td>\n",
       "      <td>0.645399</td>\n",
       "      <td>FGFR,VEGFR</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>H12_F10_RT_BC_99_Lig_BC_226-1-4</th>\n",
       "      <td>K562</td>\n",
       "      <td>100.0</td>\n",
       "      <td>100</td>\n",
       "      <td>3</td>\n",
       "      <td>1.361383</td>\n",
       "      <td>1.361383</td>\n",
       "      <td>Epigenetics</td>\n",
       "      <td>JAK/STAT signaling</td>\n",
       "      <td>JAK kinase activity</td>\n",
       "      <td>Baricitinib (LY3009104, INCB028050)_100</td>\n",
       "      <td>Baricitinib (LY3009104, INCB028050)</td>\n",
       "      <td>1.917388</td>\n",
       "      <td>rep1</td>\n",
       "      <td>0.689279</td>\n",
       "      <td>JAK</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>H12_F10_RT_BC_99_Lig_BC_337-1-4</th>\n",
       "      <td>K562</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>1.956512</td>\n",
       "      <td>2.080587</td>\n",
       "      <td>Epigenetics</td>\n",
       "      <td>Epigenetic regulation</td>\n",
       "      <td>Histone deacetylation</td>\n",
       "      <td>Entinostat (MS-275)_10</td>\n",
       "      <td>Entinostat (MS-275)</td>\n",
       "      <td>2.645024</td>\n",
       "      <td>rep2</td>\n",
       "      <td>2.140055</td>\n",
       "      <td>HDAC</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>H12_F10_RT_BC_9_Lig_BC_21-1-4</th>\n",
       "      <td>K562</td>\n",
       "      <td>10000.0</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>0.929037</td>\n",
       "      <td>1.402202</td>\n",
       "      <td>Epigenetics</td>\n",
       "      <td>Epigenetic regulation</td>\n",
       "      <td>Histone deacetylation</td>\n",
       "      <td>Resveratrol_10000</td>\n",
       "      <td>Resveratrol</td>\n",
       "      <td>1.722089</td>\n",
       "      <td>rep1</td>\n",
       "      <td>0.652712</td>\n",
       "      <td>Autophagy,Sirtuin</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>H12_F10_RT_BC_9_Lig_BC_274-1-4</th>\n",
       "      <td>K562</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>1.601367</td>\n",
       "      <td>1.783351</td>\n",
       "      <td>Endocrinology &amp; Hormones</td>\n",
       "      <td>Nuclear receptor signaling</td>\n",
       "      <td>Nuclear receptor activity</td>\n",
       "      <td>Andarine_10</td>\n",
       "      <td>Andarine</td>\n",
       "      <td>2.293501</td>\n",
       "      <td>rep1</td>\n",
       "      <td>1.010150</td>\n",
       "      <td>Androgen Receptor</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>581777 rows × 17 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   cell_type     dose dose_character  \\\n",
       "index                                                                  \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0      A549   1000.0           1000   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0      A549     10.0             10   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0      A549      0.0              0   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0      A549     10.0             10   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0      A549   1000.0           1000   \n",
       "...                                      ...      ...            ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4         K562   1000.0           1000   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4         K562    100.0            100   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4         K562     10.0             10   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4           K562  10000.0          10000   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4          K562     10.0             10   \n",
       "\n",
       "                                   dose_pattern  g1s_score  g2m_score  \\\n",
       "index                                                                   \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0            2   1.155964   2.475312   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0            4   0.000000   1.980748   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0            3   0.000000   0.000000   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0            4   1.817254   2.801225   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0            2   1.637016   0.867074   \n",
       "...                                         ...        ...        ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4               2   0.000000   1.410709   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4               3   1.361383   1.361383   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4               4   1.956512   2.080587   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4                 1   0.929037   1.402202   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4                4   1.601367   1.783351   \n",
       "\n",
       "                                                     pathway  \\\n",
       "index                                                          \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0             TGF-beta/Smad   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0                DNA Damage   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0                   Vehicle   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0                 Apoptosis   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0                 Ubiquitin   \n",
       "...                                                      ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4                 Angiogenesis   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4                  Epigenetics   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4                  Epigenetics   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4                    Epigenetics   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4      Endocrinology & Hormones   \n",
       "\n",
       "                                                          pathway_level_1  \\\n",
       "index                                                                       \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0                          PKC signaling   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0                DNA damage & DNA repair   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0                                Vehicle   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0  Protein folding & Protein degradation   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0                  Epigenetic regulation   \n",
       "...                                                                   ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4                 Tyrosine kinase signaling   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4                        JAK/STAT signaling   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4                     Epigenetic regulation   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4                       Epigenetic regulation   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4                 Nuclear receptor signaling   \n",
       "\n",
       "                                                 pathway_level_2  \\\n",
       "index                                                              \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0                 PKC activitiy   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0             Nucleotide analog   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0                       Vehicle   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0  E3 ubiquitin ligase activity   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0         Histone deacetylation   \n",
       "...                                                          ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4                     RTK activity   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4              JAK kinase activity   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4            Histone deacetylation   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4              Histone deacetylation   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4         Nuclear receptor activity   \n",
       "\n",
       "                                                               product_dose  \\\n",
       "index                                                                         \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0              Enzastaurin (LY317615)_1000   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0                           Raltitrexed_10   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0                                Vehicle_0   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0                Lenalidomide (CC-5013)_10   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0                   Divalproex Sodium_1000   \n",
       "...                                                                     ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4                               PD173074_1000   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4     Baricitinib (LY3009104, INCB028050)_100   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4                      Entinostat (MS-275)_10   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4                             Resveratrol_10000   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4                                  Andarine_10   \n",
       "\n",
       "                                                           product_name  \\\n",
       "index                                                                     \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0               Enzastaurin (LY317615)   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0                          Raltitrexed   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0                              Vehicle   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0               Lenalidomide (CC-5013)   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0                    Divalproex Sodium   \n",
       "...                                                                 ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4                                PD173074   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4     Baricitinib (LY3009104, INCB028050)   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4                     Entinostat (MS-275)   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4                               Resveratrol   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4                                 Andarine   \n",
       "\n",
       "                                    proliferation_index replicate  \\\n",
       "index                                                               \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0             2.643512      rep2   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0             1.980748      rep2   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0             0.000000      rep1   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0             3.073606      rep2   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0             1.874835      rep2   \n",
       "...                                                 ...       ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4                1.410709      rep1   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4                1.917388      rep1   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4                2.645024      rep2   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4                  1.722089      rep1   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4                 2.293501      rep1   \n",
       "\n",
       "                                    size_factor             target  vehicle  \\\n",
       "index                                                                         \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0     2.296651                PKC        0   \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0     0.480141  DNA/RNA Synthesis        0   \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0     0.516561            Vehicle        1   \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0     0.387978          TNF-alpha        0   \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0     0.724671               HDAC        0   \n",
       "...                                         ...                ...      ...   \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4        0.645399         FGFR,VEGFR        0   \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4        0.689279                JAK        0   \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4        2.140055               HDAC        0   \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4          0.652712  Autophagy,Sirtuin        0   \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4         1.010150  Androgen Receptor        0   \n",
       "\n",
       "                                   batch  \n",
       "index                                     \n",
       "A01_E09_RT_BC_100_Lig_BC_245-0-0-0     0  \n",
       "A01_E09_RT_BC_100_Lig_BC_306-0-0-0     0  \n",
       "A01_E09_RT_BC_101_Lig_BC_109-0-0-0     0  \n",
       "A01_E09_RT_BC_101_Lig_BC_229-0-0-0     0  \n",
       "A01_E09_RT_BC_101_Lig_BC_280-0-0-0     0  \n",
       "...                                  ...  \n",
       "H12_F10_RT_BC_99_Lig_BC_172-1-4        4  \n",
       "H12_F10_RT_BC_99_Lig_BC_226-1-4        4  \n",
       "H12_F10_RT_BC_99_Lig_BC_337-1-4        4  \n",
       "H12_F10_RT_BC_9_Lig_BC_21-1-4          4  \n",
       "H12_F10_RT_BC_9_Lig_BC_274-1-4         4  \n",
       "\n",
       "[581777 rows x 17 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.obs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`adata.var` is a dataframe containing annotations for each gene, such as gene names, pathways, or summary statistics (e.g. dispersion)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>num_cells_expressed-0-0</th>\n",
       "      <th>num_cells_expressed-1-0</th>\n",
       "      <th>num_cells_expressed-1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>TSPAN6</th>\n",
       "      <td>ENSG00000000003.14</td>\n",
       "      <td>4017</td>\n",
       "      <td>13320</td>\n",
       "      <td>199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>TNMD</th>\n",
       "      <td>ENSG00000000005.5</td>\n",
       "      <td>3</td>\n",
       "      <td>14</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DPM1</th>\n",
       "      <td>ENSG00000000419.12</td>\n",
       "      <td>8442</td>\n",
       "      <td>72353</td>\n",
       "      <td>11231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SCYL3</th>\n",
       "      <td>ENSG00000000457.13</td>\n",
       "      <td>4386</td>\n",
       "      <td>24120</td>\n",
       "      <td>3380</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C1orf112</th>\n",
       "      <td>ENSG00000000460.16</td>\n",
       "      <td>5998</td>\n",
       "      <td>24454</td>\n",
       "      <td>9555</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AL591163.1</th>\n",
       "      <td>ENSG00000284744.1</td>\n",
       "      <td>8</td>\n",
       "      <td>26</td>\n",
       "      <td>27</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AL589702.1</th>\n",
       "      <td>ENSG00000284745.1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AC068587.10</th>\n",
       "      <td>ENSG00000284746.1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AL034417.4</th>\n",
       "      <td>ENSG00000284747.1</td>\n",
       "      <td>230</td>\n",
       "      <td>675</td>\n",
       "      <td>634</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>AL513220.1</th>\n",
       "      <td>ENSG00000284748.1</td>\n",
       "      <td>6</td>\n",
       "      <td>81</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>58347 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                             id  num_cells_expressed-0-0  \\\n",
       "index                                                      \n",
       "TSPAN6       ENSG00000000003.14                     4017   \n",
       "TNMD          ENSG00000000005.5                        3   \n",
       "DPM1         ENSG00000000419.12                     8442   \n",
       "SCYL3        ENSG00000000457.13                     4386   \n",
       "C1orf112     ENSG00000000460.16                     5998   \n",
       "...                         ...                      ...   \n",
       "AL591163.1    ENSG00000284744.1                        8   \n",
       "AL589702.1    ENSG00000284745.1                        2   \n",
       "AC068587.10   ENSG00000284746.1                        0   \n",
       "AL034417.4    ENSG00000284747.1                      230   \n",
       "AL513220.1    ENSG00000284748.1                        6   \n",
       "\n",
       "             num_cells_expressed-1-0  num_cells_expressed-1  \n",
       "index                                                        \n",
       "TSPAN6                         13320                    199  \n",
       "TNMD                              14                      3  \n",
       "DPM1                           72353                  11231  \n",
       "SCYL3                          24120                   3380  \n",
       "C1orf112                       24454                   9555  \n",
       "...                              ...                    ...  \n",
       "AL591163.1                        26                     27  \n",
       "AL589702.1                         3                      0  \n",
       "AC068587.10                        0                      0  \n",
       "AL034417.4                       675                    634  \n",
       "AL513220.1                        81                     16  \n",
       "\n",
       "[58347 rows x 4 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.var"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Quality control\n",
    "Compute per-cell quality metrics (total counts, number of detected genes, mitochondrial fraction) and remove low-quality cells."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-cell QC metrics computed from the count matrix in adata.X.\n",
    "adata.obs['n_counts'] = np.ravel(adata.X.sum(1))  # number of counts in the cell\n",
    "adata.obs['n_genes'] = np.ravel(np.sum(adata.X > 0, axis=1))  # number of genes with at least 1 count per cell\n",
    "# Flag mitochondrial genes. Their symbols start with 'MT-', so anchor the match at the\n",
    "# beginning of the name instead of matching 'MT-' anywhere inside it.\n",
    "adata.var['mito'] = adata.var_names.str.startswith('MT-')\n",
    "# Fraction of counts coming from mitochondrial genes; high values mean dead or bad quality cells.\n",
    "adata.obs['mt_frac'] = np.ravel(adata.X[:, adata.var['mito'].values].sum(1)) / adata.obs['n_counts'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "View of AnnData object with n_obs × n_vars = 244556 × 58347\n",
       "    obs: 'cell_type', 'dose', 'dose_character', 'dose_pattern', 'g1s_score', 'g2m_score', 'pathway', 'pathway_level_1', 'pathway_level_2', 'product_dose', 'product_name', 'proliferation_index', 'replicate', 'size_factor', 'target', 'vehicle', 'batch', 'n_counts', 'n_genes', 'mt_frac'\n",
       "    var: 'id', 'num_cells_expressed-0-0', 'num_cells_expressed-1-0', 'num_cells_expressed-1', 'mito'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# filtering: keep only the cells that pass all three QC thresholds\n",
    "passes_qc = (adata.obs['n_counts'] > 500) & (adata.obs['n_genes'] > 750) & (adata.obs['mt_frac'] < 0.2)\n",
    "adata = adata[passes_qc]\n",
    "adata"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Normalization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8555.0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adata.X.max() #check it's an int, to make sure it's count data and not preprocessed data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Randomly keep half of the cells (fixed seed so the selection is reproducible).\n",
    "sc.pp.subsample(adata, fraction=0.5, random_state=0)\n",
    "\n",
    "# Scale every cell to the same total count (with no target_sum this is the median of the\n",
    "# total counts per cell), then log-transform. normalize_total replaces the deprecated\n",
    "# normalize_per_cell.\n",
    "sc.pp.normalize_total(adata)\n",
    "sc.pp.log1p(adata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature (gene) selection\n",
    "We keep only the top N most highly variable genes (N = 2000 below)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Restrict adata to the N_TOP_GENES most variable genes (subset=True).\n",
    "N_TOP_GENES = 2000\n",
    "sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_GENES, subset=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Out-of-distribution selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every distinct product name except the 'Vehicle' entry.\n",
    "all_products = adata.obs.product_name.unique()\n",
    "drugs = all_products[np.isin(all_products, ['Vehicle'], invert=True)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every drug, measure how far the mean expression of its cells lies from\n",
    "# the mean expression of all remaining cells (L2 norm of the difference).\n",
    "product_names = adata.obs.product_name\n",
    "records = []\n",
    "for drug in drugs:\n",
    "    is_drug = (product_names == drug).to_numpy()\n",
    "    drug_mean = adata[is_drug].X.mean(0)\n",
    "    rest_mean = adata[~is_drug].X.mean(0)\n",
    "    records.append({'cond1': drug, 'L2': np.linalg.norm(drug_mean - rest_mean)})\n",
    "df_vs_rest = pd.DataFrame(records)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
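    "Pick the drugs with the biggest signals: the 20 drugs whose mean expression differs most (largest L2 distance) from that of all other cells."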
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Dasatinib', 'Mocetinostat (MGCD0103)', 'Belinostat (PXD101)',\n",
       "       'Flavopiridol HCl', 'Patupilone (EPO906, Epothilone B)', '(+)-JQ1',\n",
       "       'Tanespimycin (17-AAG)', 'Trametinib (GSK1120212)',\n",
       "       'Alvespimycin (17-DMAG) HCl', 'Givinostat (ITF2357)',\n",
       "       'Pracinostat (SB939)', 'Raltitrexed', 'Hesperadin', 'AR-42',\n",
       "       'Abexinostat (PCI-24781)', 'CUDC-907', 'Dacinostat (LAQ824)',\n",
       "       'Panobinostat (LBH589)', 'YM155 (Sepantronium Bromide)',\n",
       "       'Quisinostat (JNJ-26481585) 2HCl'], dtype=object)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Take the N_OOD_DRUGS drugs with the largest L2 distance, in ascending order.\n",
    "N_OOD_DRUGS = 20\n",
    "drugs_by_l2 = df_vs_rest.sort_values(by='L2')['cond1'].to_numpy()\n",
    "drug_OOD = drugs_by_l2[-N_OOD_DRUGS:]\n",
    "drug_OOD"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare for the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start an empty 'fields' registry in adata.uns; the following cells record in it\n",
    "# which obs columns hold the perturbation, control flag, dose, covariates and split.\n",
    "adata.uns['fields'] = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Short perturbation label: the part of the product name before the first space.\n",
    "adata.obs['perturbation'] = [\n",
    "    product.partition(' ')[0] for product in adata.obs['product_name']\n",
    "]\n",
    "adata.uns['fields']['perturbation'] = 'perturbation'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary control indicator: 1 where the perturbation is 'Vehicle', 0 otherwise.\n",
    "perturbations = adata.obs['perturbation'].values\n",
    "adata.obs['control'] = [int(name == 'Vehicle') for name in perturbations]\n",
    "adata.uns['fields']['control'] = 'control'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rescale doses by the largest dose so that the maximum becomes 1.\n",
    "dose_values = adata.obs['dose'].astype(float)\n",
    "adata.obs['dose'] = dose_values / np.max(dose_values)\n",
    "adata.uns['fields']['dose'] = 'dose'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# obs columns treated as covariates; a later cell joins their values into 'cov_name'.\n",
    "adata.uns['fields']['covariates'] = ['cell_type', 'replicate']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the 'log1p' entry from adata.uns before the object is saved.\n",
    "# NOTE(review): presumably removed because it cannot be written by adata.write -- confirm.\n",
    "# Guarded so that re-running the cell, or running it when the key is absent,\n",
    "# does not raise KeyError.\n",
    "if 'log1p' in adata.uns:\n",
    "    del adata.uns['log1p']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split dataset\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Out-of-distribution cells: MCF7 cells whose product is one of the drug_OOD drugs.\n",
    "is_ood = (\n",
    "    (adata.obs['cell_type'] == 'MCF7') & adata.obs['product_name'].isin(drug_OOD)\n",
    ").to_numpy()\n",
    "\n",
    "# All remaining cells are divided 80/20 into train/test with a fixed seed.\n",
    "in_dist_pos = np.flatnonzero(~is_ood)\n",
    "train_pos, test_pos = train_test_split(in_dist_pos, test_size=0.2, random_state=42)\n",
    "\n",
    "split_labels = np.full(adata.n_obs, 'NA', dtype=object)\n",
    "split_labels[is_ood] = 'ood'\n",
    "split_labels[train_pos] = 'train'\n",
    "split_labels[test_pos] = 'test'\n",
    "\n",
    "adata.obs['split'] = split_labels\n",
    "adata.uns['fields']['split'] = 'split'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Rank differentially expressed (DE) genes (optional)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/anaconda3/envs/vci-env/lib/python3.9/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "# this will be done in main script if it's not done here\n",
    "\n",
    "import warnings\n",
    "\n",
    "from vci.utils.data_utils import rank_genes_groups\n",
    "\n",
    "# 'cov_name': the covariate values of each cell joined with '_'. Values go\n",
    "# through str() so that non-string covariate columns do not break the join.\n",
    "covariate_columns = [\n",
    "    np.asarray(adata.obs[cov]) for cov in adata.uns['fields']['covariates']\n",
    "]\n",
    "adata.obs['cov_name'] = [\n",
    "    '_'.join(map(str, values)) for values in zip(*covariate_columns)\n",
    "]\n",
    "\n",
    "# 'cov_pert_name': '<cov_name>_<perturbation>'. Both columns are fetched once\n",
    "# up front instead of being looked up again for every single cell.\n",
    "perturbation_key = adata.uns['fields']['perturbation']\n",
    "adata.obs['cov_pert_name'] = [\n",
    "    f'{cov_name}_{perturbation}'\n",
    "    for cov_name, perturbation in zip(\n",
    "        adata.obs['cov_name'].values, adata.obs[perturbation_key].values\n",
    "    )\n",
    "]\n",
    "\n",
    "# Run the project's gene-ranking helper grouped by 'cov_pert_name'; every\n",
    "# warning raised during the call is suppressed.\n",
    "with warnings.catch_warnings():\n",
    "    warnings.simplefilter(\"ignore\")\n",
    "    rank_genes_groups(adata,\n",
    "        groupby=\"cov_pert_name\",\n",
    "        reference=\"cov_name\",\n",
    "        control_key=\"control\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cell_type</th>\n",
       "      <th>dose</th>\n",
       "      <th>dose_character</th>\n",
       "      <th>dose_pattern</th>\n",
       "      <th>g1s_score</th>\n",
       "      <th>g2m_score</th>\n",
       "      <th>pathway</th>\n",
       "      <th>pathway_level_1</th>\n",
       "      <th>pathway_level_2</th>\n",
       "      <th>product_dose</th>\n",
       "      <th>...</th>\n",
       "      <th>vehicle</th>\n",
       "      <th>batch</th>\n",
       "      <th>n_counts</th>\n",
       "      <th>n_genes</th>\n",
       "      <th>mt_frac</th>\n",
       "      <th>perturbation</th>\n",
       "      <th>control</th>\n",
       "      <th>split</th>\n",
       "      <th>cov_name</th>\n",
       "      <th>cov_pert_name</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>index</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A05_F10_RT_BC_46_Lig_BC_56-1-0-1</th>\n",
       "      <td>MCF7</td>\n",
       "      <td>0.100</td>\n",
       "      <td>1000</td>\n",
       "      <td>2</td>\n",
       "      <td>2.952470</td>\n",
       "      <td>3.442190</td>\n",
       "      <td>JAK/STAT</td>\n",
       "      <td>JAK/STAT signaling</td>\n",
       "      <td>JAK kinase activity</td>\n",
       "      <td>WP1066_1000</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>3762.0</td>\n",
       "      <td>2122</td>\n",
       "      <td>0.103668</td>\n",
       "      <td>WP1066</td>\n",
       "      <td>0</td>\n",
       "      <td>test</td>\n",
       "      <td>MCF7_rep2</td>\n",
       "      <td>MCF7_rep2_WP1066</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>H06_F10_RT_BC_354_Lig_BC_124-1-0-3</th>\n",
       "      <td>MCF7</td>\n",
       "      <td>1.000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>2.730909</td>\n",
       "      <td>3.048589</td>\n",
       "      <td>JAK/STAT</td>\n",
       "      <td>JAK/STAT signaling</td>\n",
       "      <td>RTK activity</td>\n",
       "      <td>Cerdulatinib (PRT062070, PRT2070)_10000</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1763.0</td>\n",
       "      <td>1077</td>\n",
       "      <td>0.196256</td>\n",
       "      <td>Cerdulatinib</td>\n",
       "      <td>0</td>\n",
       "      <td>train</td>\n",
       "      <td>MCF7_rep2</td>\n",
       "      <td>MCF7_rep2_Cerdulatinib</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A10_E09_RT_BC_65_Lig_BC_376-0-0-0</th>\n",
       "      <td>A549</td>\n",
       "      <td>0.100</td>\n",
       "      <td>1000</td>\n",
       "      <td>2</td>\n",
       "      <td>0.823174</td>\n",
       "      <td>2.159443</td>\n",
       "      <td>Apoptosis</td>\n",
       "      <td>Apoptotic regulation</td>\n",
       "      <td>Mitochondria-mediated apoptosis</td>\n",
       "      <td>Obatoclax Mesylate (GX15-070)_1000</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1053.0</td>\n",
       "      <td>756</td>\n",
       "      <td>0.132953</td>\n",
       "      <td>Obatoclax</td>\n",
       "      <td>0</td>\n",
       "      <td>train</td>\n",
       "      <td>A549_rep1</td>\n",
       "      <td>A549_rep1_Obatoclax</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>F06_F10_RT_BC_186_Lig_BC_137-1-0-2</th>\n",
       "      <td>MCF7</td>\n",
       "      <td>0.010</td>\n",
       "      <td>100</td>\n",
       "      <td>3</td>\n",
       "      <td>2.429623</td>\n",
       "      <td>2.197367</td>\n",
       "      <td>Epigenetics</td>\n",
       "      <td>Epigenetic regulation</td>\n",
       "      <td>Histone methylation</td>\n",
       "      <td>Tazemetostat (EPZ-6438)_100</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>5374.0</td>\n",
       "      <td>2598</td>\n",
       "      <td>0.185895</td>\n",
       "      <td>Tazemetostat</td>\n",
       "      <td>0</td>\n",
       "      <td>test</td>\n",
       "      <td>MCF7_rep2</td>\n",
       "      <td>MCF7_rep2_Tazemetostat</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A06_F10_RT_BC_73_Lig_BC_81-1-3</th>\n",
       "      <td>K562</td>\n",
       "      <td>0.010</td>\n",
       "      <td>100</td>\n",
       "      <td>3</td>\n",
       "      <td>0.894844</td>\n",
       "      <td>3.423042</td>\n",
       "      <td>JAK/STAT</td>\n",
       "      <td>JAK/STAT signaling</td>\n",
       "      <td>JAK kinase activity</td>\n",
       "      <td>Ruxolitinib (INCB018424)_100</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>3024.0</td>\n",
       "      <td>1768</td>\n",
       "      <td>0.047619</td>\n",
       "      <td>Ruxolitinib</td>\n",
       "      <td>0</td>\n",
       "      <td>train</td>\n",
       "      <td>K562_rep1</td>\n",
       "      <td>K562_rep1_Ruxolitinib</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>B02_E09_RT_BC_55_Lig_BC_375-1-3</th>\n",
       "      <td>K562</td>\n",
       "      <td>1.000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>1.706056</td>\n",
       "      <td>2.348048</td>\n",
       "      <td>Cell Cycle</td>\n",
       "      <td>Cell cycle regulation</td>\n",
       "      <td>Aurora kinase activity</td>\n",
       "      <td>Barasertib (AZD1152-HQPA)_10000</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2427.0</td>\n",
       "      <td>1514</td>\n",
       "      <td>0.152040</td>\n",
       "      <td>Barasertib</td>\n",
       "      <td>0</td>\n",
       "      <td>train</td>\n",
       "      <td>K562_rep2</td>\n",
       "      <td>K562_rep2_Barasertib</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A01_E09_RT_BC_190_Lig_BC_369-1-0-1</th>\n",
       "      <td>MCF7</td>\n",
       "      <td>0.010</td>\n",
       "      <td>100</td>\n",
       "      <td>3</td>\n",
       "      <td>3.365521</td>\n",
       "      <td>2.396224</td>\n",
       "      <td>Apoptosis</td>\n",
       "      <td>Protein folding &amp; Protein degradation</td>\n",
       "      <td>E3 ubiquitin ligase activity</td>\n",
       "      <td>Lenalidomide (CC-5013)_100</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1267.0</td>\n",
       "      <td>889</td>\n",
       "      <td>0.168114</td>\n",
       "      <td>Lenalidomide</td>\n",
       "      <td>0</td>\n",
       "      <td>test</td>\n",
       "      <td>MCF7_rep1</td>\n",
       "      <td>MCF7_rep1_Lenalidomide</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>C12_E09_RT_BC_31_Lig_BC_273-0-0-0</th>\n",
       "      <td>A549</td>\n",
       "      <td>0.001</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>1.570435</td>\n",
       "      <td>2.207568</td>\n",
       "      <td>Protein Tyrosine Kinase</td>\n",
       "      <td>Tyrosine kinase signaling</td>\n",
       "      <td>RTK activity</td>\n",
       "      <td>Vandetanib (ZD6474)_10</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2826.0</td>\n",
       "      <td>1724</td>\n",
       "      <td>0.146497</td>\n",
       "      <td>Vandetanib</td>\n",
       "      <td>0</td>\n",
       "      <td>train</td>\n",
       "      <td>A549_rep2</td>\n",
       "      <td>A549_rep2_Vandetanib</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>H09_F10_RT_BC_315_Lig_BC_119-1-0-3</th>\n",
       "      <td>MCF7</td>\n",
       "      <td>0.001</td>\n",
       "      <td>10</td>\n",
       "      <td>4</td>\n",
       "      <td>1.360285</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>DNA Damage</td>\n",
       "      <td>Antioxidant</td>\n",
       "      <td>Antioxidant</td>\n",
       "      <td>Daphnetin_10</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1746.0</td>\n",
       "      <td>1117</td>\n",
       "      <td>0.168958</td>\n",
       "      <td>Daphnetin</td>\n",
       "      <td>0</td>\n",
       "      <td>train</td>\n",
       "      <td>MCF7_rep2</td>\n",
       "      <td>MCF7_rep2_Daphnetin</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>F12_F10_RT_BC_344_Lig_BC_220-1-0-3</th>\n",
       "      <td>MCF7</td>\n",
       "      <td>1.000</td>\n",
       "      <td>10000</td>\n",
       "      <td>1</td>\n",
       "      <td>2.309293</td>\n",
       "      <td>2.503580</td>\n",
       "      <td>Epigenetics</td>\n",
       "      <td>JAK/STAT signaling</td>\n",
       "      <td>JAK kinase activity</td>\n",
       "      <td>Baricitinib (LY3009104, INCB028050)_10000</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>5858.0</td>\n",
       "      <td>2789</td>\n",
       "      <td>0.171048</td>\n",
       "      <td>Baricitinib</td>\n",
       "      <td>0</td>\n",
       "      <td>train</td>\n",
       "      <td>MCF7_rep2</td>\n",
       "      <td>MCF7_rep2_Baricitinib</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>122278 rows × 25 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                   cell_type   dose dose_character  \\\n",
       "index                                                                \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1        MCF7  0.100           1000   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3      MCF7  1.000          10000   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0       A549  0.100           1000   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2      MCF7  0.010            100   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3          K562  0.010            100   \n",
       "...                                      ...    ...            ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3         K562  1.000          10000   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1      MCF7  0.010            100   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0       A549  0.001             10   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3      MCF7  0.001             10   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3      MCF7  1.000          10000   \n",
       "\n",
       "                                   dose_pattern  g1s_score  g2m_score  \\\n",
       "index                                                                   \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1              2   2.952470   3.442190   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3            1   2.730909   3.048589   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0             2   0.823174   2.159443   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2            3   2.429623   2.197367   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3                3   0.894844   3.423042   \n",
       "...                                         ...        ...        ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3               1   1.706056   2.348048   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1            3   3.365521   2.396224   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0             4   1.570435   2.207568   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3            4   1.360285   0.000000   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3            1   2.309293   2.503580   \n",
       "\n",
       "                                                    pathway  \\\n",
       "index                                                         \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1                   JAK/STAT   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3                 JAK/STAT   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0                 Apoptosis   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2              Epigenetics   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3                     JAK/STAT   \n",
       "...                                                     ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3                  Cell Cycle   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1                Apoptosis   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0   Protein Tyrosine Kinase   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3               DNA Damage   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3              Epigenetics   \n",
       "\n",
       "                                                          pathway_level_1  \\\n",
       "index                                                                       \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1                       JAK/STAT signaling   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3                     JAK/STAT signaling   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0                    Apoptotic regulation   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2                  Epigenetic regulation   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3                         JAK/STAT signaling   \n",
       "...                                                                   ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3                     Cell cycle regulation   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1  Protein folding & Protein degradation   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0               Tyrosine kinase signaling   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3                            Antioxidant   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3                     JAK/STAT signaling   \n",
       "\n",
       "                                                    pathway_level_2  \\\n",
       "index                                                                 \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1                JAK kinase activity   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3                     RTK activity   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0   Mitochondria-mediated apoptosis   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2              Histone methylation   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3                  JAK kinase activity   \n",
       "...                                                             ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3              Aurora kinase activity   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1     E3 ubiquitin ligase activity   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0                      RTK activity   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3                      Antioxidant   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3              JAK kinase activity   \n",
       "\n",
       "                                                                 product_dose  \\\n",
       "index                                                                           \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1                                  WP1066_1000   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3    Cerdulatinib (PRT062070, PRT2070)_10000   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0          Obatoclax Mesylate (GX15-070)_1000   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2                Tazemetostat (EPZ-6438)_100   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3                   Ruxolitinib (INCB018424)_100   \n",
       "...                                                                       ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3               Barasertib (AZD1152-HQPA)_10000   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1                 Lenalidomide (CC-5013)_100   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0                      Vandetanib (ZD6474)_10   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3                               Daphnetin_10   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3  Baricitinib (LY3009104, INCB028050)_10000   \n",
       "\n",
       "                                    ... vehicle  batch n_counts  n_genes  \\\n",
       "index                               ...                                    \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1    ...       0      1   3762.0     2122   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3  ...       0      3   1763.0     1077   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0   ...       0      0   1053.0      756   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2  ...       0      2   5374.0     2598   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3      ...       0      3   3024.0     1768   \n",
       "...                                 ...     ...    ...      ...      ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3     ...       0      3   2427.0     1514   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1  ...       0      1   1267.0      889   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0   ...       0      0   2826.0     1724   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3  ...       0      3   1746.0     1117   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3  ...       0      3   5858.0     2789   \n",
       "\n",
       "                                     mt_frac  perturbation control  split  \\\n",
       "index                                                                       \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1    0.103668        WP1066       0   test   \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3  0.196256  Cerdulatinib       0  train   \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0   0.132953     Obatoclax       0  train   \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2  0.185895  Tazemetostat       0   test   \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3      0.047619   Ruxolitinib       0  train   \n",
       "...                                      ...           ...     ...    ...   \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3     0.152040    Barasertib       0  train   \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1  0.168114  Lenalidomide       0   test   \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0   0.146497    Vandetanib       0  train   \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3  0.168958     Daphnetin       0  train   \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3  0.171048   Baricitinib       0  train   \n",
       "\n",
       "                                     cov_name           cov_pert_name  \n",
       "index                                                                  \n",
       "A05_F10_RT_BC_46_Lig_BC_56-1-0-1    MCF7_rep2        MCF7_rep2_WP1066  \n",
       "H06_F10_RT_BC_354_Lig_BC_124-1-0-3  MCF7_rep2  MCF7_rep2_Cerdulatinib  \n",
       "A10_E09_RT_BC_65_Lig_BC_376-0-0-0   A549_rep1     A549_rep1_Obatoclax  \n",
       "F06_F10_RT_BC_186_Lig_BC_137-1-0-2  MCF7_rep2  MCF7_rep2_Tazemetostat  \n",
       "A06_F10_RT_BC_73_Lig_BC_81-1-3      K562_rep1   K562_rep1_Ruxolitinib  \n",
       "...                                       ...                     ...  \n",
       "B02_E09_RT_BC_55_Lig_BC_375-1-3     K562_rep2    K562_rep2_Barasertib  \n",
       "A01_E09_RT_BC_190_Lig_BC_369-1-0-1  MCF7_rep1  MCF7_rep1_Lenalidomide  \n",
       "C12_E09_RT_BC_31_Lig_BC_273-0-0-0   A549_rep2    A549_rep2_Vandetanib  \n",
       "H09_F10_RT_BC_315_Lig_BC_119-1-0-3  MCF7_rep2     MCF7_rep2_Daphnetin  \n",
       "F12_F10_RT_BC_344_Lig_BC_220-1-0-3  MCF7_rep2   MCF7_rep2_Baricitinib  \n",
       "\n",
       "[122278 rows x 25 columns]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the per-cell annotations, including the columns added above.\n",
    "adata.obs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the prepared AnnData object to an .h5ad file (path relative to the working directory).\n",
    "adata.write('sciplex_prepped.h5ad')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.0 ('gvci-env')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.0"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "a18dafcc48613bb6c3783e5aa2cfbbab56400df7d9096bc06931f344217b23cf"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
