{
 "cells": [
  {
   "cell_type": "code",
   "id": "fb1fa966-2be2-47e3-ab5f-5f2044d933ab",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-01T14:13:55.018032Z",
     "start_time": "2025-07-01T14:13:54.980859Z"
    }
   },
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import sys\n",
    "import os\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import itertools\n",
    "import numpy as np\n",
    "import json\n",
    "import re\n",
    "import copy\n",
    "from scipy.stats import mannwhitneyu\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import starbars"
   ],
   "outputs": [],
   "execution_count": 1
  },
  {
   "cell_type": "code",
   "id": "ba9057d8-d80b-4002-9bdf-c6cd3ded50eb",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-01T14:13:55.378409Z",
     "start_time": "2025-07-01T14:13:55.262621Z"
    }
   },
   "source": [
    "from rdkit import RDLogger\n",
    "RDLogger.DisableLog(\"rdApp.*\")\n",
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\", category=UserWarning)"
   ],
   "outputs": [],
   "execution_count": 2
  },
  {
   "cell_type": "code",
   "id": "3de531226756e515",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-01T14:13:58.026424Z",
     "start_time": "2025-07-01T14:13:57.977877Z"
    }
   },
   "source": [
    "cpath = Path(os.getcwd())\n",
    "output_dir =  cpath.parent / \"output\" / \"vina\"\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "pdb_dir = cpath.parent / \"data\" / \"CrossDocked2020\"\n",
    "dataset_dir = cpath.parent / \"data\"\n",
    "results_path = cpath.parent / \"results\" / \"vina\"\n",
    "\n",
    "RUN_DOCKING=False"
   ],
   "outputs": [],
   "execution_count": 3
  },
  {
   "cell_type": "code",
   "id": "de6e78ba2bc69df2",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-01T14:14:22.570438Z",
     "start_time": "2025-07-01T14:14:22.536891Z"
    }
   },
   "source": [
    "datasets = [\n",
    "    'bindingdb', # reference dataset should be the first\n",
    "    'bindingdb_active',\n",
    "    'protobind_diff',\n",
    "    'pocket2mol',\n",
    "    'pocketflow',\n",
    "    'targetdiff',\n",
    "    'reinvent',\n",
    "    'tamgen',\n",
    "]\n",
    "annotation = pd.read_csv(cpath.parent / \"paper\" / 'tables' / 'selected_targets_benchmark.csv', index_col='Name')\n",
    "gene2pdb = dict(annotation['PDB name'])\n",
    "annotation"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "        Number of samples in the train set Dataset type  \\\n",
       "Name                                                      \n",
       "ESR1                                  4483         easy   \n",
       "HCRTR1                               12691         easy   \n",
       "JAK1                                 12455         easy   \n",
       "P2RX3                                 5140         easy   \n",
       "KDM1A                                 4622         easy   \n",
       "IDH1                                  5177         easy   \n",
       "RIOK1                                   15         hard   \n",
       "NR4A1                                   28         hard   \n",
       "GRIK1                                  335         hard   \n",
       "FTO                                     37         hard   \n",
       "SPIN1                                   19         hard   \n",
       "CCR9                                    82         hard   \n",
       "\n",
       "                               Family name UniProt ID PDB name  \\\n",
       "Name                                                             \n",
       "ESR1                      Nuclear receptor     P03372     2r6w   \n",
       "HCRTR1                                GPCR     O43613     4zjc   \n",
       "JAK1                                Kinase     P23458     3eyg   \n",
       "P2RX3                          Ion channel     P56373     5svl   \n",
       "KDM1A   Protein-protein interaction target     O60341     5lhg   \n",
       "IDH1                     Non-kinase enzyme     O75874     4umx   \n",
       "RIOK1                               Kinase     Q9BRS2     4otp   \n",
       "NR4A1                     Nuclear receptor     P22736     3v3q   \n",
       "GRIK1                          Ion channel     P39086     3fv1   \n",
       "FTO                      Non-kinase enzyme     Q9C0B1     4zs3   \n",
       "SPIN1   Protein-protein interaction target     Q9Y657     5jsj   \n",
       "CCR9                                  GPCR     P51686     5lwe   \n",
       "\n",
       "              L1 family name                       L2 family name  \\\n",
       "Name                                                                \n",
       "ESR1    Transcription factor                     Nuclear receptor   \n",
       "HCRTR1     Membrane receptor  Family A G protein-coupled receptor   \n",
       "JAK1                  Enzyme                               Kinase   \n",
       "P2RX3            Ion channel             Ligand-gated ion channel   \n",
       "KDM1A   Epigenetic regulator                               Eraser   \n",
       "IDH1                  Enzyme                       Oxidoreductase   \n",
       "RIOK1                 Enzyme                               Kinase   \n",
       "NR4A1   Transcription factor                     Nuclear receptor   \n",
       "GRIK1            Ion channel             Ligand-gated ion channel   \n",
       "FTO                   Enzyme                       Oxidoreductase   \n",
       "SPIN1   Epigenetic regulator                               Reader   \n",
       "CCR9       Membrane receptor  Family A G protein-coupled receptor   \n",
       "\n",
       "        sequence_int  \n",
       "Name                  \n",
       "ESR1             138  \n",
       "HCRTR1           176  \n",
       "JAK1            1145  \n",
       "P2RX3           2054  \n",
       "KDM1A           2281  \n",
       "IDH1            2908  \n",
       "RIOK1           1163  \n",
       "NR4A1           1230  \n",
       "GRIK1           1852  \n",
       "FTO             5927  \n",
       "SPIN1           7922  \n",
       "CCR9            4006  "
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Number of samples in the train set</th>\n",
       "      <th>Dataset type</th>\n",
       "      <th>Family name</th>\n",
       "      <th>UniProt ID</th>\n",
       "      <th>PDB name</th>\n",
       "      <th>L1 family name</th>\n",
       "      <th>L2 family name</th>\n",
       "      <th>sequence_int</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Name</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>4483</td>\n",
       "      <td>easy</td>\n",
       "      <td>Nuclear receptor</td>\n",
       "      <td>P03372</td>\n",
       "      <td>2r6w</td>\n",
       "      <td>Transcription factor</td>\n",
       "      <td>Nuclear receptor</td>\n",
       "      <td>138</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>12691</td>\n",
       "      <td>easy</td>\n",
       "      <td>GPCR</td>\n",
       "      <td>O43613</td>\n",
       "      <td>4zjc</td>\n",
       "      <td>Membrane receptor</td>\n",
       "      <td>Family A G protein-coupled receptor</td>\n",
       "      <td>176</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>12455</td>\n",
       "      <td>easy</td>\n",
       "      <td>Kinase</td>\n",
       "      <td>P23458</td>\n",
       "      <td>3eyg</td>\n",
       "      <td>Enzyme</td>\n",
       "      <td>Kinase</td>\n",
       "      <td>1145</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>5140</td>\n",
       "      <td>easy</td>\n",
       "      <td>Ion channel</td>\n",
       "      <td>P56373</td>\n",
       "      <td>5svl</td>\n",
       "      <td>Ion channel</td>\n",
       "      <td>Ligand-gated ion channel</td>\n",
       "      <td>2054</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>4622</td>\n",
       "      <td>easy</td>\n",
       "      <td>Protein-protein interaction target</td>\n",
       "      <td>O60341</td>\n",
       "      <td>5lhg</td>\n",
       "      <td>Epigenetic regulator</td>\n",
       "      <td>Eraser</td>\n",
       "      <td>2281</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>5177</td>\n",
       "      <td>easy</td>\n",
       "      <td>Non-kinase enzyme</td>\n",
       "      <td>O75874</td>\n",
       "      <td>4umx</td>\n",
       "      <td>Enzyme</td>\n",
       "      <td>Oxidoreductase</td>\n",
       "      <td>2908</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>15</td>\n",
       "      <td>hard</td>\n",
       "      <td>Kinase</td>\n",
       "      <td>Q9BRS2</td>\n",
       "      <td>4otp</td>\n",
       "      <td>Enzyme</td>\n",
       "      <td>Kinase</td>\n",
       "      <td>1163</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>28</td>\n",
       "      <td>hard</td>\n",
       "      <td>Nuclear receptor</td>\n",
       "      <td>P22736</td>\n",
       "      <td>3v3q</td>\n",
       "      <td>Transcription factor</td>\n",
       "      <td>Nuclear receptor</td>\n",
       "      <td>1230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>335</td>\n",
       "      <td>hard</td>\n",
       "      <td>Ion channel</td>\n",
       "      <td>P39086</td>\n",
       "      <td>3fv1</td>\n",
       "      <td>Ion channel</td>\n",
       "      <td>Ligand-gated ion channel</td>\n",
       "      <td>1852</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>37</td>\n",
       "      <td>hard</td>\n",
       "      <td>Non-kinase enzyme</td>\n",
       "      <td>Q9C0B1</td>\n",
       "      <td>4zs3</td>\n",
       "      <td>Enzyme</td>\n",
       "      <td>Oxidoreductase</td>\n",
       "      <td>5927</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>19</td>\n",
       "      <td>hard</td>\n",
       "      <td>Protein-protein interaction target</td>\n",
       "      <td>Q9Y657</td>\n",
       "      <td>5jsj</td>\n",
       "      <td>Epigenetic regulator</td>\n",
       "      <td>Reader</td>\n",
       "      <td>7922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>82</td>\n",
       "      <td>hard</td>\n",
       "      <td>GPCR</td>\n",
       "      <td>P51686</td>\n",
       "      <td>5lwe</td>\n",
       "      <td>Membrane receptor</td>\n",
       "      <td>Family A G protein-coupled receptor</td>\n",
       "      <td>4006</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 4
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f26c1b34-db4f-4df6-b833-6700c7da58d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# DOCKER binary\n",
    "CONFIG = 'laptop'\n",
    "\n",
    "#### do not change these\n",
    "dockstream_path = '' #path to dockstream\n",
    "target_preparator = dockstream_path + \"/target_preparator.py\"\n",
    "docker = dockstream_path + \"/docker.py\"\n",
    "vina_binary_location = \"\"\n",
    "# docking_config = output_dir / 'config' / 'vina_docking.json'\n",
    "\n",
    "\n",
    "# DOKING PARAMS\n",
    "num_poses = 3\n",
    "number_cores = os.cpu_count()\n",
    "exhaustiveness = 16\n",
    "delta_X = 5\n",
    "delta_Y = 5\n",
    "delta_Z = 5\n",
    "box_min_size=20"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dee89448-5769-47db-8bc4-20931057cedb",
   "metadata": {},
   "source": [
    "## Preprocess PDB and pockets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b4ad9d44-a00b-4ab0-99fc-267eb630cb3b",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for gene, pdb in gene2pdb.items():\n",
    "    if not RUN_DOCKING:\n",
    "        continue\n",
    "    pdb_protein_path = pdb_dir / pdb /  f\"{pdb}_protein_cleaned.pdb\"\n",
    "    reference_ligand_path = pdb_dir / pdb /  f\"{pdb}_ligand.sdf\"\n",
    "    assert pdb_protein_path.exists()\n",
    "    assert reference_ligand_path.exists()\n",
    "\n",
    "    \n",
    "    # generate output paths for the configuration file, the \"fixed\" PDB file and the \"rDock\" cavity\n",
    "    target_prep_path = output_dir / f\"{gene}_prep.json\"\n",
    "    fixed_pdb_path = output_dir / f\"{gene}_fixed_target.pdb\"\n",
    "    pdbqt_path = output_dir / f\"{gene}.pdbqt\"\n",
    "    log_file_target_prep = output_dir / f\"{gene}_target_prep.log\"\n",
    "\n",
    "    \n",
    "    # specify the target preparation JSON file as a dictionary and write it out\n",
    "    tp_dict = {\n",
    "      \"target_preparation\":\n",
    "      {\n",
    "        \"header\": {                                   # general settings\n",
    "          \"logging\": {                                # logging settings (e.g. which file to write to)\n",
    "            \"logfile\": str(log_file_target_prep)\n",
    "          }\n",
    "        },\n",
    "        \"input_path\": str(pdb_protein_path),          # this should be an absolute path\n",
    "        \"fixer\": {                                    # based on \"PDBFixer\"; tries to fix common problems with PDB files\n",
    "          \"enabled\": True,\n",
    "          \"standardize\": True,                        # enables standardization of residues\n",
    "          \"remove_heterogens\": True,                  # remove hetero-entries\n",
    "          \"fix_missing_heavy_atoms\": True,            # if possible, fix missing heavy atoms\n",
    "          \"fix_missing_hydrogens\": True,              # add hydrogens, which are usually not present in PDB files\n",
    "          \"fix_missing_loops\": False,                 # add missing loops; CAUTION: the result is usually not sufficient\n",
    "          \"add_water_box\": False,                     # if you want to put the receptor into a box of water molecules\n",
    "          \"fixed_pdb_path\": str(fixed_pdb_path)            # if specified and not \"None\", the fixed PDB file will be stored here\n",
    "        },\n",
    "        \"runs\": [                                     # \"runs\" holds a list of backend runs; at least one is required\n",
    "          {\n",
    "            \"backend\": \"AutoDockVina\",                # one of the backends supported (\"AutoDockVina\", \"OpenEye\", ...)\n",
    "            \"output\": {\n",
    "              \"receptor_path\": str(pdbqt_path)        # the generated receptor file will be saved to this location\n",
    "            },\n",
    "            \"parameters\": {\n",
    "              \"pH\": 7.4,                              # sets the protonation states (NOT used in Vina)\n",
    "              \"extract_box\": {                        # in order to extract the coordinates of the pocket (see text)\n",
    "                \"reference_ligand_path\":              # path to the reference ligand\n",
    "                  str(reference_ligand_path),  \n",
    "                \"reference_ligand_format\": \"sdf\"      # format of the reference ligand\n",
    "              }\n",
    "    }}]}}\n",
    "    \n",
    "    with open(target_prep_path, 'w') as f:\n",
    "        json.dump(tp_dict, f, indent=\"    \")\n",
    "        \n",
    "    # execute this in a command-line environment after replacing the parameters\n",
    "    !{sys.executable} {target_preparator} -conf {target_prep_path}\n",
    "    !head -n 25 {log_file_target_prep}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "903c149b-9f52-4c0f-aa65-bef8b697e2ab",
   "metadata": {},
   "source": [
    "## Start DOCKING"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f703ae8-0344-4a78-aa2e-cf7087dc5f35",
   "metadata": {},
   "source": [
    "### Default config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a9196aeb-7162-43cd-80cb-4d49083cd167",
   "metadata": {},
   "outputs": [],
   "source": [
    "default_config = {\n",
    "  \"docking\": {\n",
    "    \"header\": {\n",
    "      \"logging\": {\n",
    "        \"logfile\": \"vina_docking.log\"\n",
    "      }\n",
    "    },\n",
    "    \"ligand_preparation\": {\n",
    "      \"embedding_pools\": [\n",
    "        {\n",
    "          \"pool_id\": \"RDkit_pool\",\n",
    "          \"type\": \"RDkit\",\n",
    "          \"parameters\": {\n",
    "            \"removeHs\": False,\n",
    "            \"coordinate_generation\": {\n",
    "              \"method\": \"UFF\",\n",
    "              \"maximum_iterations\": 300\n",
    "            }\n",
    "          },\n",
    "          \"input\": {\n",
    "            \"standardize_smiles\": False,\n",
    "            \"type\": \"csv\",\n",
    "            \"input_path\": \".\",\n",
    "            \"delimiter\": \",\",\n",
    "            \"columns\": {\n",
    "              \"smiles\": \"SMILES\",\n",
    "              \"names\": \"drug_id\"\n",
    "            }\n",
    "          },\n",
    "          \"output\": {\n",
    "            \"conformer_path\": \"<conformer_path>\",\n",
    "            \"format\": \"sdf\"\n",
    "          }\n",
    "        }\n",
    "      ]\n",
    "    },\n",
    "    \"docking_runs\": [\n",
    "      {\n",
    "        \"backend\": \"AutoDockVina\",\n",
    "        \"run_id\": \"AutoDockVina\",\n",
    "        \"input_pools\": [\n",
    "          \"RDkit_pool\"\n",
    "        ],\n",
    "        \"parameters\": {\n",
    "          \"binary_location\": \"<binary_path>\",\n",
    "          \"parallelization\": {\n",
    "            \"number_cores\": 16\n",
    "          },\n",
    "          \"seed\": 42,\n",
    "          \"receptor_pdbqt_path\": [\n",
    "            \"<receptor_path>\"\n",
    "          ],\n",
    "          \"number_poses\": 5,\n",
    "          \"exhaustiveness\": 16,\n",
    "          \"search_space\": {\n",
    "            \"--center_x\": None,\n",
    "            \"--center_y\": None,\n",
    "            \"--center_z\": None,\n",
    "            \"--size_x\": None,\n",
    "            \"--size_y\": None,\n",
    "            \"--size_z\": None\n",
    "          }\n",
    "        },\n",
    "        \"output\": {\n",
    "          \"poses\": {\n",
    "            \"poses_path\": \"<poses_path>\"\n",
    "          },\n",
    "          \"scores\": {\n",
    "            \"scores_path\": \"<scores_path>\"\n",
    "          }\n",
    "        }\n",
    "      }\n",
    "    ]\n",
    "  }\n",
    "}\n",
    "\n",
    "def get_box_params(filename):\n",
    "    with open(filename, 'r') as f:\n",
    "        text = f.read()\n",
    "\n",
    "    # Regular expressions to match coordinate lines\n",
    "    x_match = re.search(r\"X coordinates: min=([\\d\\.\\-]+), max=([\\d\\.\\-]+)\", text)\n",
    "    y_match = re.search(r\"Y coordinates: min=([\\d\\.\\-]+), max=([\\d\\.\\-]+)\", text)\n",
    "    z_match = re.search(r\"Z coordinates: min=([\\d\\.\\-]+), max=([\\d\\.\\-]+)\", text)\n",
    "\n",
    "    xmin, xmax = float(x_match.group(1)), float(x_match.group(2))\n",
    "    ymin, ymax = float(y_match.group(1)), float(y_match.group(2))\n",
    "    zmin, zmax = float(z_match.group(1)), float(z_match.group(2))\n",
    "\n",
    "    return  xmin, xmax, ymin, ymax, zmin, zmax"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "27758a7c-5832-42eb-b0d3-05a8d352adcf",
   "metadata": {},
   "source": [
    "### Run dockig for each dataset and target"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de6a6015a389eeec",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "for dataset in datasets:\n",
    "    if not RUN_DOCKING:\n",
    "        continue\n",
    "    for gene, pdb in gene2pdb.items():\n",
    "        \n",
    "        output_dataset_dir = output_dir / dataset\n",
    "        # Run test\n",
    "        ligand_input_csv = dataset_dir / dataset / f\"boltz_{gene}.csv\"\n",
    "        ligands_prep_path = output_dataset_dir / f\"{gene}_prep_ligands.sdf\"\n",
    "        ligands_docked_path = output_dataset_dir / f\"{gene}_docked_ligands.sdf\"\n",
    "        ligands_scores_path = output_dataset_dir / f\"{gene}_scores.csv\"\n",
    "        config_tmp_dir = output_dataset_dir  / f\"{gene}_vina_docking.json\"\n",
    "        ligands_docked_log = output_dataset_dir / f\"{gene}_vina_docking.log\"\n",
    "        pdbqt_path = output_dir / f\"{gene}.pdbqt\"\n",
    "\n",
    "        if not ligand_input_csv.exists():\n",
    "            print(f\"Input file file for {gene} not found in {output_dataset_dir}\")\n",
    "            continue\n",
    "\n",
    "        if ligands_scores_path.exists():\n",
    "            print(f\"Scores for {gene} found in {output_dataset_dir}. Spipping\")\n",
    "            continue\n",
    "\n",
    "        os.makedirs(output_dataset_dir, exist_ok=True)\n",
    "\n",
    "        print(f\"Running docking for {dataset}/{gene}\")\n",
    "        conf = copy.deepcopy(default_config)\n",
    "        conf['docking']['docking_runs'][0]['output']['poses']['poses_path'] = str(ligands_docked_path)\n",
    "        conf['docking']['docking_runs'][0]['output']['scores']['scores_path'] = str(ligands_scores_path)\n",
    "        conf['docking']['header']['logging']['logfile'] = str(ligands_docked_log)\n",
    "        conf['docking']['ligand_preparation']['embedding_pools'][0]['output']['conformer_path'] = str(ligands_prep_path)\n",
    "        conf['docking']['docking_runs'][0]['parameters']['binary_location'] = str(vina_binary_location)\n",
    "        conf['docking']['docking_runs'][0]['parameters']['receptor_pdbqt_path'] = [str(pdbqt_path)]\n",
    "\n",
    "        # SETUP DOCKIG PARAMS\n",
    "        conf['docking']['docking_runs'][0]['parameters']['parallelization']['number_cores'] = number_cores\n",
    "        conf['docking']['docking_runs'][0]['number_poses'] = num_poses\n",
    "        conf['docking']['docking_runs'][0]['exhaustiveness'] = exhaustiveness\n",
    "        \n",
    "        # SETUP BOX\n",
    "        X_min, X_max, Y_min, Y_max, Z_min, Z_max = get_box_params(output_dir / f\"{gene}_target_prep.log\")\n",
    "        conf['docking']['docking_runs'][0]['parameters']['search_space']['--center_x'] = (X_max + X_min) / 2\n",
    "        conf['docking']['docking_runs'][0]['parameters']['search_space']['--center_y'] = (Y_max + Y_min) / 2\n",
    "        conf['docking']['docking_runs'][0]['parameters']['search_space']['--center_z'] = (Z_max + Z_min) / 2\n",
    "        conf['docking']['docking_runs'][0]['parameters']['search_space']['--size_x'] = max((X_max - X_min) + delta_X*2, box_min_size)\n",
    "        conf['docking']['docking_runs'][0]['parameters']['search_space']['--size_y'] = max((Y_max - Y_min) + delta_Y*2, box_min_size)\n",
    "        conf['docking']['docking_runs'][0]['parameters']['search_space']['--size_z'] = max((Z_max - Z_min) + delta_Z*2, box_min_size)\n",
    "        \n",
    "        # change smiles input\n",
    "        smiles_input = conf['docking']['ligand_preparation']['embedding_pools'][0]['input']\n",
    "        # smiles_input['type'] = 'sdf'\n",
    "        smiles_input['input_path'] = str(ligand_input_csv)\n",
    "        conf['docking']['ligand_preparation']['embedding_pools'][0]['input'] = smiles_input\n",
    "        \n",
    "        # save updated version in output_dir dir\n",
    "        with open(config_tmp_dir, 'wt') as f:\n",
    "            json.dump(conf,  f, indent=2)\n",
    "        !{sys.executable} {docker} -conf {config_tmp_dir} -print_scores\n",
    "\n",
    "        # copy scores to results folder\n",
    "        os.makedirs(results_path / dataset, exist_ok=True)\n",
    "        ligands_results_path = results_path / dataset / f\"{gene}_scores.csv\"\n",
    "        if ligands_scores_path.exists():\n",
    "            shutil.copy(ligands_scores_path, ligands_results_path)\n",
    "        else:\n",
    "            print(f\"Missing Vina scores for {dataset} {gene}\")\n",
    "            "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8afcd1cd-6691-47e4-9029-03ada6235033",
   "metadata": {},
   "source": [
    "## Load scores"
   ]
  },
  {
   "cell_type": "code",
   "id": "1dd78529-bdfb-403b-9513-c9c7809406a1",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-01T14:15:10.160020Z",
     "start_time": "2025-07-01T14:15:10.020788Z"
    }
   },
   "source": [
    "scores = {}\n",
    "for dataset in datasets:\n",
    "    output_dataset_dir = results_path / dataset\n",
    "    all_scores = list(output_dataset_dir.glob('*_scores.csv'))\n",
    "    if len(all_scores) == 0:\n",
    "        print(f\"No dockig results for {dataset}\")\n",
    "        continue\n",
    "    else:\n",
    "        print(f\"Found {len(all_scores)} targets for {dataset}\")\n",
    "\n",
    "    df_ = pd.concat([pd.read_csv(path) for path in all_scores])\n",
    "    df_ = df_[df_.lowest_conformer]\n",
    "    df_['dataset'] = dataset\n",
    "    df_ = df_.drop(columns=['ligand_number', 'enumeration', 'conformer_number', 'lowest_conformer'])\n",
    "    df_['gene_id'] = [x.split('_')[0] for x in df_['name']]\n",
    "    scores[dataset] = df_"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 12 targets for bindingdb\n",
      "Found 12 targets for bindingdb_active\n",
      "Found 12 targets for protobind_diff\n",
      "Found 12 targets for pocket2mol\n",
      "Found 12 targets for pocketflow\n",
      "Found 12 targets for targetdiff\n",
      "Found 12 targets for reinvent\n",
      "Found 12 targets for tamgen\n"
     ]
    }
   ],
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "id": "0bdbc8a1-d716-4a7d-a333-efcf2b54976e",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-01T14:15:12.243600Z",
     "start_time": "2025-07-01T14:15:12.174318Z"
    }
   },
   "source": [
    "for dataset, df_ in scores.items():\n",
    "    print(dataset.upper())\n",
    "    display(df_.groupby('gene_id')['score'].agg(['mean', 'sem', 'count']))"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BINDINGDB\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "             mean       sem  count\n",
       "gene_id                           \n",
       "CCR9    -6.194072  0.677425     97\n",
       "ESR1    -7.913155  0.096475     97\n",
       "FTO     -8.388021  0.101775     97\n",
       "GRIK1    0.609660  0.974853     97\n",
       "HCRTR1  -8.696247  0.523158     97\n",
       "IDH1    -8.766031  0.102751     97\n",
       "JAK1    -7.921918  0.423180     97\n",
       "KDM1A   -8.407258  0.110929     97\n",
       "NR4A1   -7.118165  0.079720     97\n",
       "P2RX3   -8.210567  0.101920     97\n",
       "RIOK1   -8.302124  0.096831     97\n",
       "SPIN1   -8.250660  0.103550     97"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-6.194072</td>\n",
       "      <td>0.677425</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-7.913155</td>\n",
       "      <td>0.096475</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-8.388021</td>\n",
       "      <td>0.101775</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>0.609660</td>\n",
       "      <td>0.974853</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-8.696247</td>\n",
       "      <td>0.523158</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-8.766031</td>\n",
       "      <td>0.102751</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>-7.921918</td>\n",
       "      <td>0.423180</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>-8.407258</td>\n",
       "      <td>0.110929</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-7.118165</td>\n",
       "      <td>0.079720</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-8.210567</td>\n",
       "      <td>0.101920</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-8.302124</td>\n",
       "      <td>0.096831</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-8.250660</td>\n",
       "      <td>0.103550</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BINDINGDB_ACTIVE\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "             mean       sem  count\n",
       "gene_id                           \n",
       "CCR9    -8.505871  0.244808     85\n",
       "ESR1    -8.936520  0.113297    100\n",
       "FTO     -8.085516  0.170518     31\n",
       "GRIK1   -5.091691  0.301431     97\n",
       "HCRTR1  -9.929280  0.073859    100\n",
       "IDH1    -8.868953  0.102966     86\n",
       "JAK1    -8.980592  0.098375     98\n",
       "KDM1A   -8.460949  0.091584     99\n",
       "NR4A1   -6.643400  0.236848     20\n",
       "P2RX3   -8.455567  0.077826     97\n",
       "RIOK1   -8.977214  0.202537     14\n",
       "SPIN1   -7.526375  0.361148      8"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-8.505871</td>\n",
       "      <td>0.244808</td>\n",
       "      <td>85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-8.936520</td>\n",
       "      <td>0.113297</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-8.085516</td>\n",
       "      <td>0.170518</td>\n",
       "      <td>31</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>-5.091691</td>\n",
       "      <td>0.301431</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-9.929280</td>\n",
       "      <td>0.073859</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-8.868953</td>\n",
       "      <td>0.102966</td>\n",
       "      <td>86</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>-8.980592</td>\n",
       "      <td>0.098375</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>-8.460949</td>\n",
       "      <td>0.091584</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-6.643400</td>\n",
       "      <td>0.236848</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-8.455567</td>\n",
       "      <td>0.077826</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-8.977214</td>\n",
       "      <td>0.202537</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-7.526375</td>\n",
       "      <td>0.361148</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "PROTOBIND_DIFF\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "             mean       sem  count\n",
       "gene_id                           \n",
       "CCR9    -8.315850  0.175044    100\n",
       "ESR1    -8.276194  0.099805     98\n",
       "FTO     -8.087071  0.094492     99\n",
       "GRIK1   -6.678694  0.144869     98\n",
       "HCRTR1  -9.676930  0.100524     57\n",
       "IDH1    -8.894373  0.109403     83\n",
       "JAK1    -8.557585  0.097825     82\n",
       "KDM1A   -8.058351  0.082570     97\n",
       "NR4A1   -6.945650  0.061885    100\n",
       "P2RX3   -8.219634  0.071894     71\n",
       "RIOK1   -8.345510  0.098035    100\n",
       "SPIN1   -7.554130  0.092709    100"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-8.315850</td>\n",
       "      <td>0.175044</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-8.276194</td>\n",
       "      <td>0.099805</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-8.087071</td>\n",
       "      <td>0.094492</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>-6.678694</td>\n",
       "      <td>0.144869</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-9.676930</td>\n",
       "      <td>0.100524</td>\n",
       "      <td>57</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-8.894373</td>\n",
       "      <td>0.109403</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>-8.557585</td>\n",
       "      <td>0.097825</td>\n",
       "      <td>82</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>-8.058351</td>\n",
       "      <td>0.082570</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-6.945650</td>\n",
       "      <td>0.061885</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-8.219634</td>\n",
       "      <td>0.071894</td>\n",
       "      <td>71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-8.345510</td>\n",
       "      <td>0.098035</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-7.554130</td>\n",
       "      <td>0.092709</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "POCKET2MOL\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "             mean       sem  count\n",
       "gene_id                           \n",
       "CCR9    -8.950320  0.187505    100\n",
       "ESR1    -9.084692  0.190009     65\n",
       "FTO     -8.535210  0.110165    100\n",
       "GRIK1   -4.936230  0.294365    100\n",
       "HCRTR1  -9.222250  0.194448     72\n",
       "IDH1    -8.704434  0.143805     83\n",
       "JAK1    -8.349143  0.179979     42\n",
       "KDM1A   -8.916430  0.147026    100\n",
       "NR4A1   -7.027020  0.072466    100\n",
       "P2RX3   -7.304812  0.082265     96\n",
       "RIOK1   -8.704025  0.160235     81\n",
       "SPIN1   -9.610242  0.164028     99"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-8.950320</td>\n",
       "      <td>0.187505</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-9.084692</td>\n",
       "      <td>0.190009</td>\n",
       "      <td>65</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-8.535210</td>\n",
       "      <td>0.110165</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>-4.936230</td>\n",
       "      <td>0.294365</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-9.222250</td>\n",
       "      <td>0.194448</td>\n",
       "      <td>72</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-8.704434</td>\n",
       "      <td>0.143805</td>\n",
       "      <td>83</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>-8.349143</td>\n",
       "      <td>0.179979</td>\n",
       "      <td>42</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>-8.916430</td>\n",
       "      <td>0.147026</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-7.027020</td>\n",
       "      <td>0.072466</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-7.304812</td>\n",
       "      <td>0.082265</td>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-8.704025</td>\n",
       "      <td>0.160235</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-9.610242</td>\n",
       "      <td>0.164028</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "POCKETFLOW\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "             mean       sem  count\n",
       "gene_id                           \n",
       "CCR9    -7.994480  0.152885    100\n",
       "ESR1    -8.225232  0.125721     99\n",
       "FTO     -7.754010  0.168313     99\n",
       "GRIK1   -6.654175  0.234875     97\n",
       "HCRTR1  -7.855420  0.152436    100\n",
       "IDH1    -9.083450  0.180037    100\n",
       "JAK1    -8.503670  0.147948     97\n",
       "KDM1A   -7.634780  0.139804    100\n",
       "NR4A1   -6.268465  0.128386     99\n",
       "P2RX3   -6.924192  0.136626     99\n",
       "RIOK1   -7.930691  0.145455     97\n",
       "SPIN1   -8.839380  0.143852    100"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-7.994480</td>\n",
       "      <td>0.152885</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-8.225232</td>\n",
       "      <td>0.125721</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-7.754010</td>\n",
       "      <td>0.168313</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>-6.654175</td>\n",
       "      <td>0.234875</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-7.855420</td>\n",
       "      <td>0.152436</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-9.083450</td>\n",
       "      <td>0.180037</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>-8.503670</td>\n",
       "      <td>0.147948</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>-7.634780</td>\n",
       "      <td>0.139804</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-6.268465</td>\n",
       "      <td>0.128386</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-6.924192</td>\n",
       "      <td>0.136626</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-7.930691</td>\n",
       "      <td>0.145455</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-8.839380</td>\n",
       "      <td>0.143852</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TARGETDIFF\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "                  mean            sem  count\n",
       "gene_id                                     \n",
       "CCR9         -7.402900       0.280331    100\n",
       "ESR1         -7.994561       0.108191     98\n",
       "FTO          -7.350847       0.096630     98\n",
       "GRIK1         2.124960       0.933148     99\n",
       "HCRTR1       -8.301740       0.101299    100\n",
       "IDH1         -8.670794       0.093408     97\n",
       "JAK1     309439.718052  309447.153989     97\n",
       "KDM1A    315626.309283  315633.549467     99\n",
       "NR4A1        -6.141070       0.064004    100\n",
       "P2RX3        -7.325186       0.057403     97\n",
       "RIOK1        -7.922969       0.077713     98\n",
       "SPIN1        -7.393520       0.096267     98"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-7.402900</td>\n",
       "      <td>0.280331</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-7.994561</td>\n",
       "      <td>0.108191</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-7.350847</td>\n",
       "      <td>0.096630</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>2.124960</td>\n",
       "      <td>0.933148</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-8.301740</td>\n",
       "      <td>0.101299</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-8.670794</td>\n",
       "      <td>0.093408</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>309439.718052</td>\n",
       "      <td>309447.153989</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>315626.309283</td>\n",
       "      <td>315633.549467</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-6.141070</td>\n",
       "      <td>0.064004</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-7.325186</td>\n",
       "      <td>0.057403</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-7.922969</td>\n",
       "      <td>0.077713</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-7.393520</td>\n",
       "      <td>0.096267</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "REINVENT\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "              mean       sem  count\n",
       "gene_id                            \n",
       "CCR9     -9.259340  0.088934    100\n",
       "ESR1     -8.656490  0.081679    100\n",
       "FTO      -7.795570  0.074416    100\n",
       "GRIK1    -6.108071  0.118017     99\n",
       "HCRTR1  -10.082460  0.075849     87\n",
       "IDH1     -9.750770  0.055972    100\n",
       "JAK1     -8.678620  0.080723    100\n",
       "KDM1A    -8.202500  0.073571    100\n",
       "NR4A1    -6.399660  0.056968    100\n",
       "P2RX3    -7.882130  0.055984    100\n",
       "RIOK1    -8.366920  0.066088    100\n",
       "SPIN1    -7.697390  0.073662    100"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-9.259340</td>\n",
       "      <td>0.088934</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-8.656490</td>\n",
       "      <td>0.081679</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-7.795570</td>\n",
       "      <td>0.074416</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>-6.108071</td>\n",
       "      <td>0.118017</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-10.082460</td>\n",
       "      <td>0.075849</td>\n",
       "      <td>87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-9.750770</td>\n",
       "      <td>0.055972</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>-8.678620</td>\n",
       "      <td>0.080723</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>-8.202500</td>\n",
       "      <td>0.073571</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-6.399660</td>\n",
       "      <td>0.056968</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-7.882130</td>\n",
       "      <td>0.055984</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-8.366920</td>\n",
       "      <td>0.066088</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-7.697390</td>\n",
       "      <td>0.073662</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TAMGEN\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "            mean       sem  count\n",
       "gene_id                          \n",
       "CCR9    -8.30861  0.126066    100\n",
       "ESR1    -6.48076  0.084315    100\n",
       "FTO     -6.80634  0.080060    100\n",
       "GRIK1   -3.11875  0.294150    100\n",
       "HCRTR1  -7.42286  0.106647    100\n",
       "IDH1    -6.19107  0.091012    100\n",
       "JAK1    -6.37932  0.081604    100\n",
       "KDM1A   -7.56468  0.087049    100\n",
       "NR4A1   -6.68033  0.075941    100\n",
       "P2RX3   -7.69741  0.107883    100\n",
       "RIOK1   -8.02881  0.116056    100\n",
       "SPIN1   -7.54256  0.106270    100"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>mean</th>\n",
       "      <th>sem</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gene_id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CCR9</th>\n",
       "      <td>-8.30861</td>\n",
       "      <td>0.126066</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ESR1</th>\n",
       "      <td>-6.48076</td>\n",
       "      <td>0.084315</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>FTO</th>\n",
       "      <td>-6.80634</td>\n",
       "      <td>0.080060</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>GRIK1</th>\n",
       "      <td>-3.11875</td>\n",
       "      <td>0.294150</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>HCRTR1</th>\n",
       "      <td>-7.42286</td>\n",
       "      <td>0.106647</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>IDH1</th>\n",
       "      <td>-6.19107</td>\n",
       "      <td>0.091012</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>JAK1</th>\n",
       "      <td>-6.37932</td>\n",
       "      <td>0.081604</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>KDM1A</th>\n",
       "      <td>-7.56468</td>\n",
       "      <td>0.087049</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>NR4A1</th>\n",
       "      <td>-6.68033</td>\n",
       "      <td>0.075941</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>P2RX3</th>\n",
       "      <td>-7.69741</td>\n",
       "      <td>0.107883</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>RIOK1</th>\n",
       "      <td>-8.02881</td>\n",
       "      <td>0.116056</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>SPIN1</th>\n",
       "      <td>-7.54256</td>\n",
       "      <td>0.106270</td>\n",
       "      <td>100</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "execution_count": 6
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
