{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('../../code')\n",
    "\n",
    "from utils import binarize_log_data, remove_ambiguous_row, clean_continuous"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3880917/4181517134.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  slice = pd.read_csv('../../data/raw/chembl30_slice.csv')\n"
     ]
    }
   ],
   "source": [
    "slice = pd.read_csv('../../data/raw/chembl30_slice.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>compound_chembl_id</th>\n",
       "      <th>canonical_smiles</th>\n",
       "      <th>standard_type</th>\n",
       "      <th>standard_value</th>\n",
       "      <th>standard_units</th>\n",
       "      <th>standard_relation</th>\n",
       "      <th>assay_id</th>\n",
       "      <th>target_chembl_id</th>\n",
       "      <th>activity_comment</th>\n",
       "      <th>data_validity_comment</th>\n",
       "      <th>organism</th>\n",
       "      <th>pref_name</th>\n",
       "      <th>gene_symbol</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1673</th>\n",
       "      <td>43358</td>\n",
       "      <td>CHEMBL6437</td>\n",
       "      <td>CN1CCN2c3ccccc3Cc3ccccc3C2C1</td>\n",
       "      <td>Ki</td>\n",
       "      <td>5.657972</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>303146</td>\n",
       "      <td>CHEMBL217</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1976</th>\n",
       "      <td>48837</td>\n",
       "      <td>CHEMBL269004</td>\n",
       "      <td>CCCN(CCC)[C@H]1CCc2c(O)cccc2C1</td>\n",
       "      <td>Ki</td>\n",
       "      <td>7.223299</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>851588</td>\n",
       "      <td>CHEMBL339</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Rattus norvegicus</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1981</th>\n",
       "      <td>48844</td>\n",
       "      <td>CHEMBL269004</td>\n",
       "      <td>CCCN(CCC)[C@H]1CCc2c(O)cccc2C1</td>\n",
       "      <td>Ki</td>\n",
       "      <td>7.221849</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>880031</td>\n",
       "      <td>CHEMBL339</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Rattus norvegicus</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1983</th>\n",
       "      <td>48846</td>\n",
       "      <td>CHEMBL269004</td>\n",
       "      <td>CCCN(CCC)[C@H]1CCc2c(O)cccc2C1</td>\n",
       "      <td>Ki</td>\n",
       "      <td>7.223299</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>961547</td>\n",
       "      <td>CHEMBL339</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Rattus norvegicus</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1985</th>\n",
       "      <td>48848</td>\n",
       "      <td>CHEMBL269004</td>\n",
       "      <td>CCCN(CCC)[C@H]1CCc2c(O)cccc2C1</td>\n",
       "      <td>Ki</td>\n",
       "      <td>6.812479</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>1541731</td>\n",
       "      <td>CHEMBL339</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Rattus norvegicus</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669207</th>\n",
       "      <td>16430106</td>\n",
       "      <td>CHEMBL4878501</td>\n",
       "      <td>O=C(NCCCCN1CCO[C@@H]2c3ccccc3OC[C@H]21)c1ccc2c...</td>\n",
       "      <td>Ki</td>\n",
       "      <td>6.140261</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2116072</td>\n",
       "      <td>CHEMBL217</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669271</th>\n",
       "      <td>16430350</td>\n",
       "      <td>CHEMBL4878577</td>\n",
       "      <td>O=C(NCCCCN1CCO[C@@H]2c3ccccc3OC[C@H]21)c1cccc(...</td>\n",
       "      <td>Ki</td>\n",
       "      <td>6.177832</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2116072</td>\n",
       "      <td>CHEMBL217</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669278</th>\n",
       "      <td>16430368</td>\n",
       "      <td>CHEMBL4878587</td>\n",
       "      <td>COc1cc2c(cc1OC)CN(Cc1ccccc1CNC(=O)c1ccc(C#N)cc...</td>\n",
       "      <td>Ki</td>\n",
       "      <td>7.292430</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2113158</td>\n",
       "      <td>CHEMBL217</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669412</th>\n",
       "      <td>16431038</td>\n",
       "      <td>CHEMBL4878826</td>\n",
       "      <td>O=C(NCCCCN1CCO[C@@H]2c3ccccc3OC[C@H]21)c1ccc(C...</td>\n",
       "      <td>Ki</td>\n",
       "      <td>6.324222</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2116072</td>\n",
       "      <td>CHEMBL217</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2669534</th>\n",
       "      <td>16431816</td>\n",
       "      <td>CHEMBL4879054</td>\n",
       "      <td>Fc1ccc2c(C3CCN(CCCOc4ccc(CN5CCCC5)cc4Cl)CC3)no...</td>\n",
       "      <td>Ki</td>\n",
       "      <td>7.821023</td>\n",
       "      <td>nM</td>\n",
       "      <td>=</td>\n",
       "      <td>2112202</td>\n",
       "      <td>CHEMBL217</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Homo sapiens</td>\n",
       "      <td>Dopamine D2 receptor</td>\n",
       "      <td>DRD2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8482 rows × 14 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         Unnamed: 0 compound_chembl_id  \\\n",
       "1673          43358         CHEMBL6437   \n",
       "1976          48837       CHEMBL269004   \n",
       "1981          48844       CHEMBL269004   \n",
       "1983          48846       CHEMBL269004   \n",
       "1985          48848       CHEMBL269004   \n",
       "...             ...                ...   \n",
       "2669207    16430106      CHEMBL4878501   \n",
       "2669271    16430350      CHEMBL4878577   \n",
       "2669278    16430368      CHEMBL4878587   \n",
       "2669412    16431038      CHEMBL4878826   \n",
       "2669534    16431816      CHEMBL4879054   \n",
       "\n",
       "                                          canonical_smiles standard_type  \\\n",
       "1673                          CN1CCN2c3ccccc3Cc3ccccc3C2C1            Ki   \n",
       "1976                        CCCN(CCC)[C@H]1CCc2c(O)cccc2C1            Ki   \n",
       "1981                        CCCN(CCC)[C@H]1CCc2c(O)cccc2C1            Ki   \n",
       "1983                        CCCN(CCC)[C@H]1CCc2c(O)cccc2C1            Ki   \n",
       "1985                        CCCN(CCC)[C@H]1CCc2c(O)cccc2C1            Ki   \n",
       "...                                                    ...           ...   \n",
       "2669207  O=C(NCCCCN1CCO[C@@H]2c3ccccc3OC[C@H]21)c1ccc2c...            Ki   \n",
       "2669271  O=C(NCCCCN1CCO[C@@H]2c3ccccc3OC[C@H]21)c1cccc(...            Ki   \n",
       "2669278  COc1cc2c(cc1OC)CN(Cc1ccccc1CNC(=O)c1ccc(C#N)cc...            Ki   \n",
       "2669412  O=C(NCCCCN1CCO[C@@H]2c3ccccc3OC[C@H]21)c1ccc(C...            Ki   \n",
       "2669534  Fc1ccc2c(C3CCN(CCCOc4ccc(CN5CCCC5)cc4Cl)CC3)no...            Ki   \n",
       "\n",
       "         standard_value standard_units standard_relation  assay_id  \\\n",
       "1673           5.657972             nM                 =    303146   \n",
       "1976           7.223299             nM                 =    851588   \n",
       "1981           7.221849             nM                 =    880031   \n",
       "1983           7.223299             nM                 =    961547   \n",
       "1985           6.812479             nM                 =   1541731   \n",
       "...                 ...            ...               ...       ...   \n",
       "2669207        6.140261             nM                 =   2116072   \n",
       "2669271        6.177832             nM                 =   2116072   \n",
       "2669278        7.292430             nM                 =   2113158   \n",
       "2669412        6.324222             nM                 =   2116072   \n",
       "2669534        7.821023             nM                 =   2112202   \n",
       "\n",
       "        target_chembl_id activity_comment data_validity_comment  \\\n",
       "1673           CHEMBL217              NaN                   NaN   \n",
       "1976           CHEMBL339              NaN                   NaN   \n",
       "1981           CHEMBL339              NaN                   NaN   \n",
       "1983           CHEMBL339              NaN                   NaN   \n",
       "1985           CHEMBL339              NaN                   NaN   \n",
       "...                  ...              ...                   ...   \n",
       "2669207        CHEMBL217              NaN                   NaN   \n",
       "2669271        CHEMBL217              NaN                   NaN   \n",
       "2669278        CHEMBL217              NaN                   NaN   \n",
       "2669412        CHEMBL217              NaN                   NaN   \n",
       "2669534        CHEMBL217              NaN                   NaN   \n",
       "\n",
       "                  organism             pref_name gene_symbol  \n",
       "1673          Homo sapiens  Dopamine D2 receptor        DRD2  \n",
       "1976     Rattus norvegicus  Dopamine D2 receptor        DRD2  \n",
       "1981     Rattus norvegicus  Dopamine D2 receptor        DRD2  \n",
       "1983     Rattus norvegicus  Dopamine D2 receptor        DRD2  \n",
       "1985     Rattus norvegicus  Dopamine D2 receptor        DRD2  \n",
       "...                    ...                   ...         ...  \n",
       "2669207       Homo sapiens  Dopamine D2 receptor        DRD2  \n",
       "2669271       Homo sapiens  Dopamine D2 receptor        DRD2  \n",
       "2669278       Homo sapiens  Dopamine D2 receptor        DRD2  \n",
       "2669412       Homo sapiens  Dopamine D2 receptor        DRD2  \n",
       "2669534       Homo sapiens  Dopamine D2 receptor        DRD2  \n",
       "\n",
       "[8482 rows x 14 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "slice['standard_value'] = slice['standard_value'].map(lambda value: -np.log10((max(value, 0) + 1)) + 9)\n",
    "slice = slice[slice['standard_type'] == 'Ki']\n",
    "drd = slice[slice['gene_symbol'] == 'DRD2']\n",
    "drd"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare DRD2-Hi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>smiles</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brc1ccc(-[n+]2cc[n+](Cc3ccccc3)cc2)c2cc[nH]c12</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brc1ccc(CNCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Brc1ccc(NCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6262</th>\n",
       "      <td>c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5o4)cc3)CC2)nc1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6263</th>\n",
       "      <td>c1cnc(N2CCN(CCCSc3nc4ccccc4s3)CC2)nc1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6264</th>\n",
       "      <td>c1cnc(N2CCN(Cc3c[nH]c4ncccc34)CC2)nc1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6265</th>\n",
       "      <td>c1cncc(CN[C@H]2C3C4CC5C6C4CC3C6C52)c1</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6266</th>\n",
       "      <td>c1nc2c(s1)CCN(CCCCN1CCc3ncsc3CC1)CC2</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6267 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                smiles  value\n",
       "0       Brc1ccc(-[n+]2cc[n+](Cc3ccccc3)cc2)c2cc[nH]c12   True\n",
       "1          Brc1ccc(CNCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1  False\n",
       "2              Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12   True\n",
       "3          Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1   True\n",
       "4           Brc1ccc(NCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1   True\n",
       "...                                                ...    ...\n",
       "6262  c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5o4)cc3)CC2)nc1   True\n",
       "6263             c1cnc(N2CCN(CCCSc3nc4ccccc4s3)CC2)nc1   True\n",
       "6264             c1cnc(N2CCN(Cc3c[nH]c4ncccc34)CC2)nc1  False\n",
       "6265             c1cncc(CN[C@H]2C3C4CC5C6C4CC3C6C52)c1  False\n",
       "6266              c1nc2c(s1)CCN(CCCCN1CCc3ncsc3CC1)CC2  False\n",
       "\n",
       "[6267 rows x 2 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drd_binary = binarize_log_data(drd, threshold=6)\n",
    "drd_binary = remove_ambiguous_row(drd_binary)\n",
    "drd_hi = pd.DataFrame({\n",
    "    'smiles': drd_binary.index.to_list(),\n",
    "    'value': drd_binary['label'] > 0.5\n",
    "}).reset_index(drop=True)\n",
    "drd_hi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "drd_hi.to_csv('../../data/raw/drd2_hi.csv')"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Prepare DRD2-Lo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/steshin/gero_benchmark/notebooks/data/../../code/utils.py:85: FutureWarning: Dropping invalid columns in DataFrameGroupBy.min is deprecated. In a future version, a TypeError will be raised. Before calling .min, select only columns which should be valid for the function.\n",
      "  min_values = group.min()\n",
      "/data/steshin/gero_benchmark/notebooks/data/../../code/utils.py:86: FutureWarning: Dropping invalid columns in DataFrameGroupBy.max is deprecated. In a future version, a TypeError will be raised. Before calling .max, select only columns which should be valid for the function.\n",
      "  max_values = group.max()\n",
      "/data/steshin/gero_benchmark/notebooks/data/../../code/utils.py:90: FutureWarning: The default value of numeric_only in DataFrameGroupBy.median is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.\n",
      "  continuous_clean = group.median()\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>smiles</th>\n",
       "      <th>value</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Brc1ccc(-[n+]2cc[n+](Cc3ccccc3)cc2)c2cc[nH]c12</td>\n",
       "      <td>7.717691</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Brc1ccc(CNCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1</td>\n",
       "      <td>5.283913</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12</td>\n",
       "      <td>7.437357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1</td>\n",
       "      <td>7.288705</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Brc1ccc(NCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1</td>\n",
       "      <td>6.035740</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5414</th>\n",
       "      <td>c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5[nH]4)cc3)CC...</td>\n",
       "      <td>6.568636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5415</th>\n",
       "      <td>c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5o4)cc3)CC2)nc1</td>\n",
       "      <td>6.701147</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5416</th>\n",
       "      <td>c1cnc(N2CCN(CCCSc3nc4ccccc4s3)CC2)nc1</td>\n",
       "      <td>6.273273</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5417</th>\n",
       "      <td>c1cnc(N2CCN(Cc3c[nH]c4ncccc34)CC2)nc1</td>\n",
       "      <td>5.931443</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5418</th>\n",
       "      <td>c1nc2c(s1)CCN(CCCCN1CCc3ncsc3CC1)CC2</td>\n",
       "      <td>5.199931</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5419 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 smiles     value\n",
       "0        Brc1ccc(-[n+]2cc[n+](Cc3ccccc3)cc2)c2cc[nH]c12  7.717691\n",
       "1           Brc1ccc(CNCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1  5.283913\n",
       "2               Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12  7.437357\n",
       "3           Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1  7.288705\n",
       "4            Brc1ccc(NCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1  6.035740\n",
       "...                                                 ...       ...\n",
       "5414  c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5[nH]4)cc3)CC...  6.568636\n",
       "5415   c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5o4)cc3)CC2)nc1  6.701147\n",
       "5416              c1cnc(N2CCN(CCCSc3nc4ccccc4s3)CC2)nc1  6.273273\n",
       "5417              c1cnc(N2CCN(Cc3c[nH]c4ncccc34)CC2)nc1  5.931443\n",
       "5418               c1nc2c(s1)CCN(CCCCN1CCc3ncsc3CC1)CC2  5.199931\n",
       "\n",
       "[5419 rows x 2 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "drd_continuous = clean_continuous(drd)\n",
    "drd_lo = pd.DataFrame({\n",
    "    'smiles': drd_continuous['canonical_smiles'],\n",
    "    'value': drd_continuous['standard_value']\n",
    "}).reset_index(drop=True)\n",
    "drd_lo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "drd_lo.to_csv('../../data/raw/drd2_lo.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "gero_benchmark",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
