{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "import sys\n",
    "sys.path.append('../../code')\n",
    "from utils import calculate_circles_quick"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3878952/4181517134.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  slice = pd.read_csv('../../data/raw/chembl30_slice.csv')\n"
     ]
    }
   ],
   "source": [
    "# NOTE: `slice` shadows the Python built-in of the same name; the name is kept\n",
    "# because the Ki and IC50 cells below read this variable.\n",
    "# low_memory=False parses each column in one pass, which avoids the mixed-type\n",
    "# DtypeWarning pandas raised for column 10 under the default chunked parsing.\n",
    "slice = pd.read_csv('../../data/raw/chembl30_slice.csv', low_memory=False)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate #Circles Metric for Ki"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DRD2       8482\n",
       "ADORA2A    7373\n",
       "CA2        7280\n",
       "CA1        6820\n",
       "ADORA1     6717\n",
       "ADORA3     5735\n",
       "CNR2       5726\n",
       "OPRM1      5538\n",
       "HTR1A      5516\n",
       "CNR1       5355\n",
       "CA9        5251\n",
       "HRH3       4987\n",
       "DRD3       4947\n",
       "HTR2A      4653\n",
       "OPRD1      4514\n",
       "SLC6A4     4107\n",
       "CA12       4071\n",
       "HTR6       3697\n",
       "OPRK1      3258\n",
       "ADORA2B    2828\n",
       "Name: gene_symbol, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Restrict the ChEMBL slice to Ki measurements.\n",
    "is_ki = slice['standard_type'] == 'Ki'\n",
    "ki = slice.loc[is_ki]\n",
    "\n",
    "# The 20 gene symbols with the most Ki rows, most frequent first.\n",
    "ki_gene_counts = ki['gene_symbol'].value_counts()\n",
    "top_ki_genes = ki_gene_counts.iloc[:20]\n",
    "top_ki_genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 8482/8482 [00:01<00:00, 4391.63it/s]\n",
      "100%|██████████| 7373/7373 [00:01<00:00, 4484.42it/s]\n",
      "100%|██████████| 7280/7280 [00:01<00:00, 5531.93it/s] \n",
      "100%|██████████| 6820/6820 [00:01<00:00, 5452.15it/s] \n",
      "100%|██████████| 6717/6717 [00:01<00:00, 4592.59it/s]\n",
      "100%|██████████| 5735/5735 [00:01<00:00, 4568.33it/s]\n",
      "100%|██████████| 5726/5726 [00:01<00:00, 4739.76it/s]\n",
      "100%|██████████| 5538/5538 [00:01<00:00, 3911.58it/s]\n",
      "100%|██████████| 5516/5516 [00:01<00:00, 4419.15it/s]\n",
      "100%|██████████| 5355/5355 [00:01<00:00, 4715.93it/s]\n",
      "100%|██████████| 5251/5251 [00:01<00:00, 5195.91it/s]\n",
      "100%|██████████| 4987/4987 [00:00<00:00, 5474.09it/s]\n",
      "100%|██████████| 4947/4947 [00:01<00:00, 4684.79it/s]\n",
      "100%|██████████| 4653/4653 [00:00<00:00, 4689.14it/s]\n",
      "100%|██████████| 4514/4514 [00:01<00:00, 4022.19it/s]\n",
      "100%|██████████| 4107/4107 [00:00<00:00, 5025.48it/s]\n",
      "100%|██████████| 4071/4071 [00:00<00:00, 5319.22it/s]\n",
      "100%|██████████| 3697/3697 [00:00<00:00, 4740.55it/s]\n",
      "100%|██████████| 3258/3258 [00:00<00:00, 4395.69it/s]\n",
      "100%|██████████| 2828/2828 [00:00<00:00, 4838.12it/s]\n"
     ]
    }
   ],
   "source": [
    "# One #Circles value per gene, in the same order as top_ki_genes.\n",
    "# `circles` is reassigned by the IC50 section further down, so run the cells in order.\n",
    "circles = [\n",
    "    calculate_circles_quick(\n",
    "        ki.loc[ki['gene_symbol'] == gene, 'canonical_smiles'].to_list()\n",
    "    )\n",
    "    for gene in top_ki_genes.index\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene</th>\n",
       "      <th>total_mols</th>\n",
       "      <th>circles</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DRD2</td>\n",
       "      <td>8482</td>\n",
       "      <td>837</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ADORA2A</td>\n",
       "      <td>7373</td>\n",
       "      <td>750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CA2</td>\n",
       "      <td>7280</td>\n",
       "      <td>900</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CA1</td>\n",
       "      <td>6820</td>\n",
       "      <td>859</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ADORA1</td>\n",
       "      <td>6717</td>\n",
       "      <td>667</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>ADORA3</td>\n",
       "      <td>5735</td>\n",
       "      <td>477</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>CNR2</td>\n",
       "      <td>5726</td>\n",
       "      <td>553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>OPRM1</td>\n",
       "      <td>5538</td>\n",
       "      <td>498</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>HTR1A</td>\n",
       "      <td>5516</td>\n",
       "      <td>668</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>CNR1</td>\n",
       "      <td>5355</td>\n",
       "      <td>549</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>CA9</td>\n",
       "      <td>5251</td>\n",
       "      <td>775</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>HRH3</td>\n",
       "      <td>4987</td>\n",
       "      <td>465</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>DRD3</td>\n",
       "      <td>4947</td>\n",
       "      <td>492</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>HTR2A</td>\n",
       "      <td>4653</td>\n",
       "      <td>649</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>OPRD1</td>\n",
       "      <td>4514</td>\n",
       "      <td>384</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>SLC6A4</td>\n",
       "      <td>4107</td>\n",
       "      <td>468</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>CA12</td>\n",
       "      <td>4071</td>\n",
       "      <td>624</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>HTR6</td>\n",
       "      <td>3697</td>\n",
       "      <td>551</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>OPRK1</td>\n",
       "      <td>3258</td>\n",
       "      <td>344</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>ADORA2B</td>\n",
       "      <td>2828</td>\n",
       "      <td>254</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       gene  total_mols  circles\n",
       "0      DRD2        8482      837\n",
       "1   ADORA2A        7373      750\n",
       "2       CA2        7280      900\n",
       "3       CA1        6820      859\n",
       "4    ADORA1        6717      667\n",
       "5    ADORA3        5735      477\n",
       "6      CNR2        5726      553\n",
       "7     OPRM1        5538      498\n",
       "8     HTR1A        5516      668\n",
       "9      CNR1        5355      549\n",
       "10      CA9        5251      775\n",
       "11     HRH3        4987      465\n",
       "12     DRD3        4947      492\n",
       "13    HTR2A        4653      649\n",
       "14    OPRD1        4514      384\n",
       "15   SLC6A4        4107      468\n",
       "16     CA12        4071      624\n",
       "17     HTR6        3697      551\n",
       "18    OPRK1        3258      344\n",
       "19  ADORA2B        2828      254"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary table: one row per gene with its Ki row count and #Circles value.\n",
    "ki_genes_statistics = (\n",
    "    top_ki_genes\n",
    "    .rename_axis('gene')\n",
    "    .reset_index(name='total_mols')\n",
    "    .assign(circles=circles)\n",
    ")\n",
    "\n",
    "ki_genes_statistics"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate #Circles Metric for IC50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KCNH2      11159\n",
       "KDR         8826\n",
       "EGFR        8814\n",
       "BACE1       6220\n",
       "HDAC1       5501\n",
       "ACHE        5310\n",
       "HSD11B1     5201\n",
       "MAOB        4939\n",
       "JAK2        4489\n",
       "HDAC6       4377\n",
       "PIK3CA      4376\n",
       "BRD4        4113\n",
       "PDE10A      4096\n",
       "BRAF        3919\n",
       "PTGS2       3844\n",
       "MET         3744\n",
       "TRPV1       3670\n",
       "MAPK14      3551\n",
       "FAAH        3523\n",
       "FLT3        3430\n",
       "Name: gene_symbol, dtype: int64"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Restrict the ChEMBL slice to IC50 measurements.\n",
    "is_ic50 = slice['standard_type'] == 'IC50'\n",
    "ic50 = slice.loc[is_ic50]\n",
    "\n",
    "# The 20 gene symbols with the most IC50 rows, most frequent first.\n",
    "ic50_gene_counts = ic50['gene_symbol'].value_counts()\n",
    "top_ic50_genes = ic50_gene_counts.iloc[:20]\n",
    "top_ic50_genes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 11159/11159 [00:03<00:00, 3229.29it/s]\n",
      "100%|██████████| 8826/8826 [00:02<00:00, 4001.46it/s]\n",
      "100%|██████████| 8814/8814 [00:02<00:00, 3886.56it/s]\n",
      "100%|██████████| 6220/6220 [00:01<00:00, 4394.82it/s]\n",
      "100%|██████████| 5501/5501 [00:01<00:00, 4606.88it/s]\n",
      "100%|██████████| 5310/5310 [00:01<00:00, 4483.46it/s]\n",
      "100%|██████████| 5201/5201 [00:01<00:00, 4843.99it/s]\n",
      "100%|██████████| 4939/4939 [00:00<00:00, 5753.68it/s]\n",
      "100%|██████████| 4489/4489 [00:01<00:00, 4261.72it/s]\n",
      "100%|██████████| 4377/4377 [00:00<00:00, 4881.30it/s]\n",
      "100%|██████████| 4376/4376 [00:00<00:00, 4439.69it/s]\n",
      "100%|██████████| 4113/4113 [00:00<00:00, 4586.64it/s]\n",
      "100%|██████████| 4096/4096 [00:00<00:00, 4621.39it/s]\n",
      "100%|██████████| 3919/3919 [00:00<00:00, 4462.95it/s]\n",
      "100%|██████████| 3844/3844 [00:00<00:00, 5237.63it/s]\n",
      "100%|██████████| 3744/3744 [00:00<00:00, 3897.03it/s]\n",
      "100%|██████████| 3670/3670 [00:00<00:00, 5027.30it/s]\n",
      "100%|██████████| 3551/3551 [00:00<00:00, 4459.57it/s]\n",
      "100%|██████████| 3523/3523 [00:00<00:00, 5195.87it/s]\n",
      "100%|██████████| 3430/3430 [00:00<00:00, 4146.38it/s]\n"
     ]
    }
   ],
   "source": [
    "# One #Circles value per gene, in the same order as top_ic50_genes.\n",
    "# This reassigns the `circles` list built in the Ki section above.\n",
    "circles = [\n",
    "    calculate_circles_quick(\n",
    "        ic50.loc[ic50['gene_symbol'] == gene, 'canonical_smiles'].to_list()\n",
    "    )\n",
    "    for gene in top_ic50_genes.index\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>gene</th>\n",
       "      <th>total_mols</th>\n",
       "      <th>circles</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>KCNH2</td>\n",
       "      <td>11159</td>\n",
       "      <td>2128</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KDR</td>\n",
       "      <td>8826</td>\n",
       "      <td>791</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>EGFR</td>\n",
       "      <td>8814</td>\n",
       "      <td>773</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>BACE1</td>\n",
       "      <td>6220</td>\n",
       "      <td>532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>HDAC1</td>\n",
       "      <td>5501</td>\n",
       "      <td>715</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>ACHE</td>\n",
       "      <td>5310</td>\n",
       "      <td>663</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>HSD11B1</td>\n",
       "      <td>5201</td>\n",
       "      <td>423</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>MAOB</td>\n",
       "      <td>4939</td>\n",
       "      <td>656</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>JAK2</td>\n",
       "      <td>4489</td>\n",
       "      <td>492</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>HDAC6</td>\n",
       "      <td>4377</td>\n",
       "      <td>613</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>PIK3CA</td>\n",
       "      <td>4376</td>\n",
       "      <td>433</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>BRD4</td>\n",
       "      <td>4113</td>\n",
       "      <td>457</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>PDE10A</td>\n",
       "      <td>4096</td>\n",
       "      <td>380</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>BRAF</td>\n",
       "      <td>3919</td>\n",
       "      <td>313</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>PTGS2</td>\n",
       "      <td>3844</td>\n",
       "      <td>533</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>MET</td>\n",
       "      <td>3744</td>\n",
       "      <td>402</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>TRPV1</td>\n",
       "      <td>3670</td>\n",
       "      <td>314</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>MAPK14</td>\n",
       "      <td>3551</td>\n",
       "      <td>400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>FAAH</td>\n",
       "      <td>3523</td>\n",
       "      <td>378</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>FLT3</td>\n",
       "      <td>3430</td>\n",
       "      <td>372</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       gene  total_mols  circles\n",
       "0     KCNH2       11159     2128\n",
       "1       KDR        8826      791\n",
       "2      EGFR        8814      773\n",
       "3     BACE1        6220      532\n",
       "4     HDAC1        5501      715\n",
       "5      ACHE        5310      663\n",
       "6   HSD11B1        5201      423\n",
       "7      MAOB        4939      656\n",
       "8      JAK2        4489      492\n",
       "9     HDAC6        4377      613\n",
       "10   PIK3CA        4376      433\n",
       "11     BRD4        4113      457\n",
       "12   PDE10A        4096      380\n",
       "13     BRAF        3919      313\n",
       "14    PTGS2        3844      533\n",
       "15      MET        3744      402\n",
       "16    TRPV1        3670      314\n",
       "17   MAPK14        3551      400\n",
       "18     FAAH        3523      378\n",
       "19     FLT3        3430      372"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary table: one row per gene with its IC50 row count and #Circles value.\n",
    "ic50_genes_statistics = (\n",
    "    top_ic50_genes\n",
    "    .rename_axis('gene')\n",
    "    .reset_index(name='total_mols')\n",
    "    .assign(circles=circles)\n",
    ")\n",
    "\n",
    "ic50_genes_statistics"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Calculate #Circles Metric for the HIV Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>smiles</th>\n",
       "      <th>activity</th>\n",
       "      <th>HIV_active</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>O=S(=O)(O)CCS(=O)(=O)O</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41122</th>\n",
       "      <td>CCC1CCC2c3c([nH]c4ccc(C)cc34)C3C(=O)N(N(C)C)C(...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41123</th>\n",
       "      <td>Cc1ccc2[nH]c3c(c2c1)C1CCC(C(C)(C)C)CC1C1C(=O)N...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41124</th>\n",
       "      <td>Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41125</th>\n",
       "      <td>Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41126</th>\n",
       "      <td>CCCCCC=C(c1cc(Cl)c(OC)c(-c2nc(C)no2)c1)c1cc(Cl...</td>\n",
       "      <td>CI</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>41127 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  smiles activity  HIV_active\n",
       "0      CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...       CI           0\n",
       "1      C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...       CI           0\n",
       "2                       CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21       CI           0\n",
       "3        Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1       CI           0\n",
       "4                                 O=S(=O)(O)CCS(=O)(=O)O       CI           0\n",
       "...                                                  ...      ...         ...\n",
       "41122  CCC1CCC2c3c([nH]c4ccc(C)cc34)C3C(=O)N(N(C)C)C(...       CI           0\n",
       "41123  Cc1ccc2[nH]c3c(c2c1)C1CCC(C(C)(C)C)CC1C1C(=O)N...       CI           0\n",
       "41124  Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C...       CI           0\n",
       "41125  Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)...       CI           0\n",
       "41126  CCCCCC=C(c1cc(Cl)c(OC)c(-c2nc(C)no2)c1)c1cc(Cl...       CI           0\n",
       "\n",
       "[41127 rows x 3 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the raw HIV dataset; the bare name on the last line renders the frame as the cell output.\n",
    "hiv = pd.read_csv(filepath_or_buffer='../../data/raw/HIV.csv')\n",
    "hiv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 85%|████████▌ | 35128/41127 [01:15<00:34, 175.37it/s] [14:41:07] WARNING: not removing hydrogen atom without neighbors\n",
      "[14:41:07] WARNING: not removing hydrogen atom without neighbors\n",
      "100%|██████████| 41127/41127 [01:55<00:00, 357.22it/s]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "19222"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pass a plain list of SMILES, matching how the Ki and IC50 cells call calculate_circles_quick.\n",
    "# The result is stored so it can be reused without recomputing, then shown as the cell output.\n",
    "hiv_circles = calculate_circles_quick(hiv['smiles'].to_list())\n",
    "hiv_circles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "qsar_longevity",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
