{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('../../code')\n",
    "\n",
    "from utils import binarize_log_data, remove_ambiguous_row, clean_continuous"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3899204/4181517134.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  slice = pd.read_csv('../../data/raw/chembl30_slice.csv')\n"
     ]
    }
   ],
   "source": [
    "# Load the raw ChEMBL 30 activity slice from the data directory.\n",
    "# low_memory=False lets pandas infer every column's dtype from the whole file at once,\n",
    "# which removes the mixed-type DtypeWarning this read used to emit for column 10.\n",
    "# NOTE(review): the name `slice` shadows Python's builtin; it is kept because the next cell reads it.\n",
    "slice = pd.read_csv('../../data/raw/chembl30_slice.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>compound_chembl_id</th>\n",
       "      <th>canonical_smiles</th>\n",
       "      <th>standard_type</th>\n",
       "      <th>standard_value</th>\n",
       "      <th>standard_units</th>\n",
       "      <th>standard_relation</th>\n",
       "      <th>assay_id</th>\n",
       "      <th>target_chembl_id</th>\n",
       "      <th>activity_comment</th>\n",
       "      <th>data_validity_comment</th>\n",
       "      <th>organism</th>\n",
       "      <th>pref_name</th>\n",
       "      <th>gene_symbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>755</td>\n",
       "      <td>CHEMBL2</td>\n",
       "      <td>COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.799727</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>725152</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>649</th>\n",
       "      <td>26406</td>\n",
       "      <td>CHEMBL8</td>\n",
       "      <td>O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O</td>\n",
       "      <td>IC50</td>\n",
       "      <td>4.522864</td>\n",
       "      <td>nM</td>\n",
       "      <td>&gt;</td>\n",
       "      <td>856195</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1753</th>\n",
       "      <td>44704</td>\n",
       "      <td>CHEMBL11</td>\n",
       "      <td>CN(C)CCCN1c2ccccc2CCc2ccccc21</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.469872</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>158540</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1755</th>\n",
       "      <td>44749</td>\n",
       "      <td>CHEMBL11</td>\n",
       "      <td>CN(C)CCCN1c2ccccc2CCc2ccccc21</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.469872</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>307283</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1757</th>\n",
       "      <td>44793</td>\n",
       "      <td>CHEMBL11</td>\n",
       "      <td>CN(C)CCCN1c2ccccc2CCc2ccccc21</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.469872</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>556369</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669374</th>\n",
       "      <td>16430847</td>\n",
       "      <td>CHEMBL4878766</td>\n",
       "      <td>CCC(NC(=O)c1ccc(Cl)cc1)[C@@H]1[C@H]2C[C@H](n3c...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>6.107349</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2133671</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669464</th>\n",
       "      <td>16431366</td>\n",
       "      <td>CHEMBL4878917</td>\n",
       "      <td>NC1CCN(c2c(-c3cc(F)cc(Cl)c3)cncc2-c2nc3cc(F)cc...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.309715</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2132048</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669502</th>\n",
       "      <td>16431587</td>\n",
       "      <td>CHEMBL4878988</td>\n",
       "      <td>CCC(NC(=O)c1ccc(Cl)cc1)[C@@H]1[C@H]2C[C@H](n3c...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>7.221849</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2133671</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669587</th>\n",
       "      <td>16432118</td>\n",
       "      <td>CHEMBL4879150</td>\n",
       "      <td>NCCn1nnc(-c2ccc(Oc3ccc(Cl)cc3)cc2)n1</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.070530</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2125318</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669600</th>\n",
       "      <td>16432191</td>\n",
       "      <td>CHEMBL4879173</td>\n",
       "      <td>CCC(NC(=O)c1ccc(Cl)cc1)[C@@H]1[C@H]2C[C@H](n3c...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>6.160522</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2133671</td>\n",
       "      <td>CHEMBL240</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>HERG</td>\n",
       "      <td>KCNH2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>11159 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Unnamed: 0 compound_chembl_id  \\\n",
       "109             755            CHEMBL2   \n",
       "649           26406            CHEMBL8   \n",
       "1753          44704           CHEMBL11   \n",
       "1755          44749           CHEMBL11   \n",
       "1757          44793           CHEMBL11   \n",
       "...             ...                ...   \n",
       "2669374    16430847      CHEMBL4878766   \n",
       "2669464    16431366      CHEMBL4878917   \n",
       "2669502    16431587      CHEMBL4878988   \n",
       "2669587    16432118      CHEMBL4879150   \n",
       "2669600    16432191      CHEMBL4879173   \n",
       "\n",
       "                                          canonical_smiles standard_type  \\\n",
       "109          COc1cc2nc(N3CCN(C(=O)c4ccco4)CC3)nc(N)c2cc1OC          IC50   \n",
       "649             O=C(O)c1cn(C2CC2)c2cc(N3CCNCC3)c(F)cc2c1=O          IC50   \n",
       "1753                         CN(C)CCCN1c2ccccc2CCc2ccccc21          IC50   \n",
       "1755                         CN(C)CCCN1c2ccccc2CCc2ccccc21          IC50   \n",
       "1757                         CN(C)CCCN1c2ccccc2CCc2ccccc21          IC50   \n",
       "...                                                    ...           ...   \n",
       "2669374  CCC(NC(=O)c1ccc(Cl)cc1)[C@@H]1[C@H]2C[C@H](n3c...          IC50   \n",
       "2669464  NC1CCN(c2c(-c3cc(F)cc(Cl)c3)cncc2-c2nc3cc(F)cc...          IC50   \n",
       "2669502  CCC(NC(=O)c1ccc(Cl)cc1)[C@@H]1[C@H]2C[C@H](n3c...          IC50   \n",
       "2669587               NCCn1nnc(-c2ccc(Oc3ccc(Cl)cc3)cc2)n1          IC50   \n",
       "2669600  CCC(NC(=O)c1ccc(Cl)cc1)[C@@H]1[C@H]2C[C@H](n3c...          IC50   \n",
       "\n",
       "         standard_value standard_units standard_relation  assay_id  \\\n",
       "109            5.799727             nM                 =    725152   \n",
       "649            4.522864             nM                 >    856195   \n",
       "1753           5.469872             nM                 =    158540   \n",
       "1755           5.469872             nM                 =    307283   \n",
       "1757           5.469872             nM                 =    556369   \n",
       "...                 ...            ...               ...       ...   \n",
       "2669374        6.107349             nM                 =   2133671   \n",
       "2669464        5.309715             nM                 =   2132048   \n",
       "2669502        7.221849             nM                 =   2133671   \n",
       "2669587        5.070530             nM                 =   2125318   \n",
       "2669600        6.160522             nM                 =   2133671   \n",
       "\n",
       "        target_chembl_id activity_comment data_validity_comment      organism  \\\n",
       "109            CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "649            CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "1753           CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "1755           CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "1757           CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "...                  ...              ...                   ...           ...   \n",
       "2669374        CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "2669464        CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "2669502        CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "2669587        CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "2669600        CHEMBL240              NaN                   NaN  Homo sapiens   \n",
       "\n",
       "        pref_name gene_symbol  \n",
       "109          HERG       KCNH2  \n",
       "649          HERG       KCNH2  \n",
       "1753         HERG       KCNH2  \n",
       "1755         HERG       KCNH2  \n",
       "1757         HERG       KCNH2  \n",
       "...           ...         ...  \n",
       "2669374      HERG       KCNH2  \n",
       "2669464      HERG       KCNH2  \n",
       "2669502      HERG       KCNH2  \n",
       "2669587      HERG       KCNH2  \n",
       "2669600      HERG       KCNH2  \n",
       "\n",
       "[11159 rows x 14 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Select the IC50 rows for gene symbol KCNH2 first, then convert only those rows.\n",
    "# Working on a filtered copy leaves `slice` untouched, so re-running this cell no longer\n",
    "# applies the log transform a second time, and the conversion is vectorized over the\n",
    "# selected rows instead of calling a Python lambda once per row of the full table.\n",
    "ic50 = slice[slice['standard_type'] == 'IC50']\n",
    "kcnh2 = ic50[ic50['gene_symbol'] == 'KCNH2'].copy()\n",
    "\n",
    "# value -> -log10(max(value, 0) + 1) + 9; negatives are clamped to 0 so the log is always defined.\n",
    "# NOTE(review): the +9 offset presumably assumes standard_value is in nM; rows are not filtered\n",
    "# on standard_units here, and that column keeps its original label after the conversion - confirm.\n",
    "kcnh2['standard_value'] = -np.log10(kcnh2['standard_value'].clip(lower=0) + 1) + 9\n",
    "kcnh2"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare KCNH2-Lo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/steshin/gero_benchmark/notebooks/data/../../code/utils.py:90: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
      "  continuous_clean = group.median()\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>smiles</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brc1ccc(Nc2ccc(CN3CCC4(CC3)OCCc3sccc34)cc2)cc1</td>\n",
       "      <td>5.370201</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brc1ccc2c(NC3=NC[C@@]4(CN5CCC4CC5)O3)ncnn12</td>\n",
       "      <td>5.601886</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Brc1cnc2nc(N3CCN4CCC3CC4)oc2c1</td>\n",
       "      <td>5.638083</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C#CCOc1cnc(C(=O)Nc2cc(F)c(F)c([C@@]3(C)N=C(N)S...</td>\n",
       "      <td>5.161088</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C#Cc1cnc(Nc2cnc(C#N)c(O[C@H](C)CN(C)C)n2)cc1NC</td>\n",
       "      <td>5.096856</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4577</th>\n",
       "      <td>c1cnc2c(N3CCN(CCc4ccc(OCCCN5CCCCCC5)cc4)CC3)cc...</td>\n",
       "      <td>5.099945</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4578</th>\n",
       "      <td>c1cncc(-c2c[nH]c([C@H]3Cc4c([nH]c5ccccc45)C(C4...</td>\n",
       "      <td>6.718967</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4579</th>\n",
       "      <td>c1cncc(-c2c[nH]c([C@H]3Cc4c([nH]c5ccccc45)[C@@...</td>\n",
       "      <td>5.568315</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4580</th>\n",
       "      <td>c1cncc(-c2ccc(-c3noc(C4CN5CCC4CC5)n3)o2)c1</td>\n",
       "      <td>5.193752</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4581</th>\n",
       "      <td>c1ncc(-c2cc3sc(N4CCC(N5CCCCC5)CC4)nc3cn2)cn1</td>\n",
       "      <td>5.194197</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4582 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 smiles     value\n",
       "0        Brc1ccc(Nc2ccc(CN3CCC4(CC3)OCCc3sccc34)cc2)cc1  5.370201\n",
       "1           Brc1ccc2c(NC3=NC[C@@]4(CN5CCC4CC5)O3)ncnn12  5.601886\n",
       "2                        Brc1cnc2nc(N3CCN4CCC3CC4)oc2c1  5.638083\n",
       "3     C#CCOc1cnc(C(=O)Nc2cc(F)c(F)c([C@@]3(C)N=C(N)S...  5.161088\n",
       "4        C#Cc1cnc(Nc2cnc(C#N)c(O[C@H](C)CN(C)C)n2)cc1NC  5.096856\n",
       "...                                                 ...       ...\n",
       "4577  c1cnc2c(N3CCN(CCc4ccc(OCCCN5CCCCCC5)cc4)CC3)cc...  5.099945\n",
       "4578  c1cncc(-c2c[nH]c([C@H]3Cc4c([nH]c5ccccc45)C(C4...  6.718967\n",
       "4579  c1cncc(-c2c[nH]c([C@H]3Cc4c([nH]c5ccccc45)[C@@...  5.568315\n",
       "4580         c1cncc(-c2ccc(-c3noc(C4CN5CCC4CC5)n3)o2)c1  5.193752\n",
       "4581       c1ncc(-c2cc3sc(N4CCC(N5CCCCC5)CC4)nc3cn2)cn1  5.194197\n",
       "\n",
       "[4582 rows x 2 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run the project helper from utils over the KCNH2 rows, then keep only the two columns\n",
    "# the Lo dataset needs, under short names and with a fresh 0..n-1 index.\n",
    "# NOTE(review): clean_continuous lives in code/utils.py; this cell's stored warning shows it\n",
    "# takes a groupby median, presumably one row per SMILES - verify there.\n",
    "kcnh2_continuous = clean_continuous(kcnh2)\n",
    "column_names = {'canonical_smiles': 'smiles', 'standard_value': 'value'}\n",
    "kcnh2_lo = (\n",
    "    kcnh2_continuous[list(column_names)]\n",
    "    .rename(columns=column_names)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "kcnh2_lo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the dataset. index=True is the pandas default, spelled out here because it\n",
    "# writes the row index as the first, unnamed CSV column that readers must account for.\n",
    "kcnh2_lo.to_csv('../../data/raw/kcnh2_lo.csv', index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gero_benchmark",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
