{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "# Make the project's shared helpers (code/utils.py) importable from this notebook.\n",
    "# The membership check keeps the cell idempotent: re-running it no longer piles up\n",
    "# duplicate entries in sys.path.\n",
    "CODE_DIR = '../../code'\n",
    "if CODE_DIR not in sys.path:\n",
    "    sys.path.append(CODE_DIR)\n",
    "\n",
    "from utils import binarize_log_data, remove_ambiguous_row, clean_continuous"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_68753/4181517134.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  slice = pd.read_csv('../../data/raw/chembl30_slice.csv')\n"
     ]
    }
   ],
   "source": [
    "# low_memory=False makes pandas infer each column's dtype from the whole file instead of\n",
    "# chunk by chunk, which removes the mixed-type DtypeWarning previously raised for column 10.\n",
    "# NOTE(review): `slice` shadows the Python builtin; the name is kept because a later cell reads it.\n",
    "slice = pd.read_csv('../../data/raw/chembl30_slice.csv', low_memory=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>compound_chembl_id</th>\n",
       "      <th>canonical_smiles</th>\n",
       "      <th>standard_type</th>\n",
       "      <th>standard_value</th>\n",
       "      <th>standard_units</th>\n",
       "      <th>standard_relation</th>\n",
       "      <th>assay_id</th>\n",
       "      <th>target_chembl_id</th>\n",
       "      <th>activity_comment</th>\n",
       "      <th>data_validity_comment</th>\n",
       "      <th>organism</th>\n",
       "      <th>pref_name</th>\n",
       "      <th>gene_symbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>368</th>\n",
       "      <td>6996</td>\n",
       "      <td>CHEMBL6246</td>\n",
       "      <td>O=c1oc2c(O)c(O)cc3c(=O)oc4c(O)c(O)cc1c4c23</td>\n",
       "      <td>IC50</td>\n",
       "      <td>6.101824</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>808259</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8128</th>\n",
       "      <td>113927</td>\n",
       "      <td>CHEMBL7724</td>\n",
       "      <td>Cc1cc2ncc(-c3ccccc3)nc2cc1C</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.207538</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>603363</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11424</th>\n",
       "      <td>183777</td>\n",
       "      <td>CHEMBL50</td>\n",
       "      <td>O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12</td>\n",
       "      <td>IC50</td>\n",
       "      <td>6.551294</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>1284294</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11480</th>\n",
       "      <td>184225</td>\n",
       "      <td>CHEMBL50</td>\n",
       "      <td>O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12</td>\n",
       "      <td>IC50</td>\n",
       "      <td>5.779630</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>1774202</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19701</th>\n",
       "      <td>350004</td>\n",
       "      <td>CHEMBL98</td>\n",
       "      <td>O=C(CCCCCCC(=O)Nc1ccccc1)NO</td>\n",
       "      <td>IC50</td>\n",
       "      <td>4.999957</td>\n",
       "      <td>nM</td>\n",
       "      <td>&gt;</td>\n",
       "      <td>1527399</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2668327</th>\n",
       "      <td>16425523</td>\n",
       "      <td>CHEMBL4877264</td>\n",
       "      <td>CC(=O)N1CCN([C@H]2C[C@@H](n3cc(-c4ccc(Cl)c(N)c...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>6.075204</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2125061</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2668516</th>\n",
       "      <td>16426271</td>\n",
       "      <td>CHEMBL4877505</td>\n",
       "      <td>CCCNc1cc(Oc2ccc(NC(=O)Nc3cccc(C(F)(F)F)c3)cc2)...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>7.698970</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2132287</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669245</th>\n",
       "      <td>16430243</td>\n",
       "      <td>CHEMBL4878552</td>\n",
       "      <td>Cc1ccc(NC(=O)Cn2cc(-c3ccc4c(NC(=O)c5cnn(C)c5)n...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>8.761954</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2111788</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669269</th>\n",
       "      <td>16430348</td>\n",
       "      <td>CHEMBL4878575</td>\n",
       "      <td>O=C(CSc1nc2ccccc2n2cnnc12)Nc1ccc(C(=O)NC2CCCCC...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>7.296709</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2102714</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669638</th>\n",
       "      <td>16432281</td>\n",
       "      <td>CHEMBL4879197</td>\n",
       "      <td>Nc1n[nH]c2cc(-c3cnn(CC(=O)Nc4ccc(Cl)c(C(F)(F)F...</td>\n",
       "      <td>IC50</td>\n",
       "      <td>7.886057</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2111788</td>\n",
       "      <td>CHEMBL279</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Vascular endothelial growth factor receptor 2</td>\n",
       "      <td>KDR</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8826 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Unnamed: 0 compound_chembl_id  \\\n",
       "368            6996         CHEMBL6246   \n",
       "8128         113927         CHEMBL7724   \n",
       "11424        183777           CHEMBL50   \n",
       "11480        184225           CHEMBL50   \n",
       "19701        350004           CHEMBL98   \n",
       "...             ...                ...   \n",
       "2668327    16425523      CHEMBL4877264   \n",
       "2668516    16426271      CHEMBL4877505   \n",
       "2669245    16430243      CHEMBL4878552   \n",
       "2669269    16430348      CHEMBL4878575   \n",
       "2669638    16432281      CHEMBL4879197   \n",
       "\n",
       "                                          canonical_smiles standard_type  \\\n",
       "368             O=c1oc2c(O)c(O)cc3c(=O)oc4c(O)c(O)cc1c4c23          IC50   \n",
       "8128                           Cc1cc2ncc(-c3ccccc3)nc2cc1C          IC50   \n",
       "11424           O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12          IC50   \n",
       "11480           O=c1c(O)c(-c2ccc(O)c(O)c2)oc2cc(O)cc(O)c12          IC50   \n",
       "19701                          O=C(CCCCCCC(=O)Nc1ccccc1)NO          IC50   \n",
       "...                                                    ...           ...   \n",
       "2668327  CC(=O)N1CCN([C@H]2C[C@@H](n3cc(-c4ccc(Cl)c(N)c...          IC50   \n",
       "2668516  CCCNc1cc(Oc2ccc(NC(=O)Nc3cccc(C(F)(F)F)c3)cc2)...          IC50   \n",
       "2669245  Cc1ccc(NC(=O)Cn2cc(-c3ccc4c(NC(=O)c5cnn(C)c5)n...          IC50   \n",
       "2669269  O=C(CSc1nc2ccccc2n2cnnc12)Nc1ccc(C(=O)NC2CCCCC...          IC50   \n",
       "2669638  Nc1n[nH]c2cc(-c3cnn(CC(=O)Nc4ccc(Cl)c(C(F)(F)F...          IC50   \n",
       "\n",
       "         standard_value standard_units standard_relation  assay_id  \\\n",
       "368            6.101824             nM                 =    808259   \n",
       "8128           5.207538             nM                 =    603363   \n",
       "11424          6.551294             nM                 =   1284294   \n",
       "11480          5.779630             nM                 =   1774202   \n",
       "19701          4.999957             nM                 >   1527399   \n",
       "...                 ...            ...               ...       ...   \n",
       "2668327        6.075204             nM                 =   2125061   \n",
       "2668516        7.698970             nM                 =   2132287   \n",
       "2669245        8.761954             nM                 =   2111788   \n",
       "2669269        7.296709             nM                 =   2102714   \n",
       "2669638        7.886057             nM                 =   2111788   \n",
       "\n",
       "        target_chembl_id activity_comment data_validity_comment      organism  \\\n",
       "368            CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "8128           CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "11424          CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "11480          CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "19701          CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "...                  ...              ...                   ...           ...   \n",
       "2668327        CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "2668516        CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "2669245        CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "2669269        CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "2669638        CHEMBL279              NaN                   NaN  Homo sapiens   \n",
       "\n",
       "                                             pref_name gene_symbol  \n",
       "368      Vascular endothelial growth factor receptor 2         KDR  \n",
       "8128     Vascular endothelial growth factor receptor 2         KDR  \n",
       "11424    Vascular endothelial growth factor receptor 2         KDR  \n",
       "11480    Vascular endothelial growth factor receptor 2         KDR  \n",
       "19701    Vascular endothelial growth factor receptor 2         KDR  \n",
       "...                                                ...         ...  \n",
       "2668327  Vascular endothelial growth factor receptor 2         KDR  \n",
       "2668516  Vascular endothelial growth factor receptor 2         KDR  \n",
       "2669245  Vascular endothelial growth factor receptor 2         KDR  \n",
       "2669269  Vascular endothelial growth factor receptor 2         KDR  \n",
       "2669638  Vascular endothelial growth factor receptor 2         KDR  \n",
       "\n",
       "[8826 rows x 14 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only IC50 measurements for the KDR gene. The raw frame `slice` is left untouched,\n",
    "# so re-running this cell can no longer apply the log transform a second time.\n",
    "ic50 = slice[slice['standard_type'] == 'IC50']\n",
    "kdr = ic50[ic50['gene_symbol'] == 'KDR'].copy()\n",
    "\n",
    "# Vectorized form of -log10(max(value, 0) + 1) + 9: negatives are clipped to 0 first and\n",
    "# NaNs stay NaN. Presumably this converts nM concentrations to a pIC50-like scale - TODO confirm.\n",
    "kdr['standard_value'] = -np.log10(kdr['standard_value'].clip(lower=0) + 1) + 9\n",
    "kdr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare KDR-Hi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>smiles</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brc1ccc(-c2nc3ccc(Nc4ncnc5ccccc45)cc3[nH]2)cc1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Brc1cccc(Nc2ncnc3ccc(NCc4ccc5c(c4)OCCCO5)cc23)c1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C#CC(C)OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C#CC(OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC)c...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6272</th>\n",
       "      <td>c1cn(Cc2ccc3c(c2)-c2[nH]nc(-c4ccsc4)c2C3)cn1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6273</th>\n",
       "      <td>c1cnc(CCc2ccncc2)c(-c2nnc(NCc3ccc4c(c3)OCO4)o2)c1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6274</th>\n",
       "      <td>c1cnc(CCc2ccncc2)c(-c2nnc(Nc3cnc4ccccc4c3)o2)c1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6275</th>\n",
       "      <td>c1cncc(-c2cnn3cc(-c4ccc(OCCN5CCCCC5)cc4)cnc23)c1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6276</th>\n",
       "      <td>c1csc(-c2n[nH]c3c2Cc2ccccc2-3)c1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6277 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 smiles  value\n",
       "0        Brc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1   True\n",
       "1        Brc1ccc(-c2nc3ccc(Nc4ncnc5ccccc45)cc3[nH]2)cc1   True\n",
       "2      Brc1cccc(Nc2ncnc3ccc(NCc4ccc5c(c4)OCCCO5)cc23)c1  False\n",
       "3     C#CC(C)OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC...   True\n",
       "4     C#CC(OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC)c...   True\n",
       "...                                                 ...    ...\n",
       "6272       c1cn(Cc2ccc3c(c2)-c2[nH]nc(-c4ccsc4)c2C3)cn1   True\n",
       "6273  c1cnc(CCc2ccncc2)c(-c2nnc(NCc3ccc4c(c3)OCO4)o2)c1   True\n",
       "6274    c1cnc(CCc2ccncc2)c(-c2nnc(Nc3cnc4ccccc4c3)o2)c1   True\n",
       "6275   c1cncc(-c2cnn3cc(-c4ccc(OCCN5CCCCC5)cc4)cnc23)c1   True\n",
       "6276                   c1csc(-c2n[nH]c3c2Cc2ccccc2-3)c1   True\n",
       "\n",
       "[6277 rows x 2 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Binarize the log-scale activities at threshold 6, then drop the ambiguous rows.\n",
    "kdr_binary = remove_ambiguous_row(binarize_log_data(kdr, threshold=6))\n",
    "\n",
    "# Two-column table: the index of kdr_binary becomes 'smiles', and 'value' is True\n",
    "# wherever the label exceeds 0.5. The result carries a fresh 0..n-1 index.\n",
    "kdr_hi = (\n",
    "    kdr_binary['label']\n",
    "    .gt(0.5)\n",
    "    .rename('value')\n",
    "    .rename_axis('smiles')\n",
    "    .reset_index()\n",
    ")\n",
    "kdr_hi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the KDR-Hi table to disk with default to_csv options (the index is written as well).\n",
    "kdr_hi.to_csv(path_or_buf='../../data/raw/kdr_hi.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare KDR-Lo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/simon/papers/lohi/notebooks/data/../../code/utils.py:85: FutureWarning: Dropping invalid columns in DataFrameGroupBy.min is deprecated. In a future version, a TypeError will be raised. Before calling .min, select only columns which should be valid for the function.\n",
      "  min_values = group.min()\n",
      "/home/simon/papers/lohi/notebooks/data/../../code/utils.py:86: FutureWarning: Dropping invalid columns in DataFrameGroupBy.max is deprecated. In a future version, a TypeError will be raised. Before calling .max, select only columns which should be valid for the function.\n",
      "  max_values = group.max()\n",
      "/home/simon/papers/lohi/notebooks/data/../../code/utils.py:90: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
      "  continuous_clean = group.median()\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>smiles</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1</td>\n",
       "      <td>6.742321</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brc1ccc(-c2nc3ccc(Nc4ncnc5ccccc45)cc3[nH]2)cc1</td>\n",
       "      <td>6.419075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>C#CC(C)OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC...</td>\n",
       "      <td>7.432974</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>C#CC(OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC)c...</td>\n",
       "      <td>8.108191</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>C#CCO/N=C/c1c(N)ncnc1Oc1ccc2[nH]c(C)cc2c1F</td>\n",
       "      <td>7.275724</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4890</th>\n",
       "      <td>c1cn(Cc2ccc3c(c2)-c2[nH]nc(-c4ccsc4)c2C3)cn1</td>\n",
       "      <td>6.680695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4891</th>\n",
       "      <td>c1cnc(CCc2ccncc2)c(-c2nnc(NCc3ccc4c(c3)OCO4)o2)c1</td>\n",
       "      <td>6.800804</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4892</th>\n",
       "      <td>c1cnc(CCc2ccncc2)c(-c2nnc(Nc3cnc4ccccc4c3)o2)c1</td>\n",
       "      <td>6.419075</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4893</th>\n",
       "      <td>c1cncc(-c2cnn3cc(-c4ccc(OCCN5CCCCC5)cc4)cnc23)c1</td>\n",
       "      <td>7.677781</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4894</th>\n",
       "      <td>c1csc(-c2n[nH]c3c2Cc2ccccc2-3)c1</td>\n",
       "      <td>6.031050</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4895 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 smiles     value\n",
       "0        Brc1ccc(-c2nc3ccc(Nc4ccnc5ccccc45)cc3[nH]2)cc1  6.742321\n",
       "1        Brc1ccc(-c2nc3ccc(Nc4ncnc5ccccc45)cc3[nH]2)cc1  6.419075\n",
       "2     C#CC(C)OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC...  7.432974\n",
       "3     C#CC(OC1=CC(=O)C(Nc2ncnc3cc(OCCCN4CCCC4)c(OC)c...  8.108191\n",
       "4            C#CCO/N=C/c1c(N)ncnc1Oc1ccc2[nH]c(C)cc2c1F  7.275724\n",
       "...                                                 ...       ...\n",
       "4890       c1cn(Cc2ccc3c(c2)-c2[nH]nc(-c4ccsc4)c2C3)cn1  6.680695\n",
       "4891  c1cnc(CCc2ccncc2)c(-c2nnc(NCc3ccc4c(c3)OCO4)o2)c1  6.800804\n",
       "4892    c1cnc(CCc2ccncc2)c(-c2nnc(Nc3cnc4ccccc4c3)o2)c1  6.419075\n",
       "4893   c1cncc(-c2cnn3cc(-c4ccc(OCCN5CCCCC5)cc4)cnc23)c1  7.677781\n",
       "4894                   c1csc(-c2n[nH]c3c2Cc2ccccc2-3)c1  6.031050\n",
       "\n",
       "[4895 rows x 2 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Clean the continuous activity values with the project helper.\n",
    "kdr_continuous = clean_continuous(kdr)\n",
    "\n",
    "# Keep only the structure and its activity value, renamed to the smiles/value layout,\n",
    "# and replace the index with a fresh 0..n-1 range.\n",
    "kdr_lo = (\n",
    "    kdr_continuous[['canonical_smiles', 'standard_value']]\n",
    "    .rename(columns={'canonical_smiles': 'smiles', 'standard_value': 'value'})\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "kdr_lo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the KDR-Lo table to disk with default to_csv options (the index is written as well).\n",
    "kdr_lo.to_csv(path_or_buf='../../data/raw/kdr_lo.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "lohi_benchmark",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
