{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, time\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from joblib import Parallel, delayed\n",
    "from ogb.utils.url import download_url, extract_zip\n",
    "\n",
    "from molecules.physics import generate_physics_dict_structure\n",
    "from molecules.parse_sdf import sdf_to_mols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "USE_MICRO_DATASET = False # Set to false to use the FULL dataset, otherwise a dummy smaller dataset\n",
    "ROOT = \".\"\n",
    "micro_name = \"_micro\" if USE_MICRO_DATASET else \"\"\n",
    "SDF_URL = f\"https://datasets-public-research.s3.us-east-2.amazonaws.com/PCQM4M/pcqm4m-v2_sdf{micro_name}.zip\"\n",
    "SDF_LOCAL_ZIP = f\"{ROOT}/pcqm4m-v2_sdf{micro_name}.zip\"\n",
    "SDF_LOCAL = os.path.join(os.path.splitext(SDF_LOCAL_ZIP)[0])\n",
    "RAW_URL = \"http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2.zip\"\n",
    "RAW_CSV = f\"{ROOT}/pcqm4m-v2/raw/data.csv.gz\"\n",
    "SKIP_DOWNLOAD = True # Set to false for downloading and unzipping the file. If already downloaded, set to True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not SKIP_DOWNLOAD:\n",
    "    # This can take many minutes\n",
    "    path = download_url(SDF_URL, ROOT)\n",
    "    extract_zip(path, ROOT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not SKIP_DOWNLOAD:\n",
    "    raw_path = download_url(RAW_URL, ROOT)\n",
    "    extract_zip(raw_path, ROOT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_files_from_dir(dir, ext):\n",
    "    found_files = []\n",
    "    all_dirs = os.walk(dir)\n",
    "    for root, dirs, files in all_dirs:\n",
    "        for filename in files:\n",
    "            if filename.endswith(ext):\n",
    "                found_files.append(os.path.join(root, filename))\n",
    "    return found_files\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['./pcqm4m-v2_sdf/00250000_00259999/252506.sdf', './pcqm4m-v2_sdf/00250000_00259999/253520.sdf', './pcqm4m-v2_sdf/00250000_00259999/253300.sdf', './pcqm4m-v2_sdf/00250000_00259999/259104.sdf', './pcqm4m-v2_sdf/00250000_00259999/256366.sdf', './pcqm4m-v2_sdf/00250000_00259999/254181.sdf', './pcqm4m-v2_sdf/00250000_00259999/256379.sdf', './pcqm4m-v2_sdf/00250000_00259999/252575.sdf', './pcqm4m-v2_sdf/00250000_00259999/255187.sdf', './pcqm4m-v2_sdf/00250000_00259999/257670.sdf']\n"
     ]
    }
   ],
   "source": [
    "sdf_files = get_files_from_dir(SDF_LOCAL, \".sdf\")\n",
    "time.sleep(1)\n",
    "print(sdf_files[:10])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|          | 24142/3378606 [00:09<20:11, 2768.56it/s]RDKit WARNING: [20:35:08] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "  1%|          | 24429/3378606 [00:09<19:59, 2797.44it/s][20:35:08] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "  6%|▌         | 210874/3378606 [01:17<18:56, 2786.49it/s]RDKit WARNING: [20:36:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[20:36:17] WARNING: not removing hydrogen atom without neighbors\n",
      "RDKit WARNING: [20:36:17] WARNING: not removing hydrogen atom without neighbors\n",
      "RDKit WARNING: [20:36:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[20:36:17] WARNING: not removing hydrogen atom without neighbors\n",
      "[20:36:17] WARNING: not removing hydrogen atom without neighbors\n",
      "  7%|▋         | 222041/3378606 [01:21<18:59, 2770.51it/s]RDKit WARNING: [20:36:21] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:36:21] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 24%|██▍       | 826602/3378606 [05:06<14:58, 2840.59it/s]RDKit WARNING: [20:40:05] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:40:05] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 27%|██▋       | 905188/3378606 [05:35<14:34, 2827.69it/s]RDKit WARNING: [20:40:34] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:40:34] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 36%|███▌      | 1199765/3378606 [07:24<13:01, 2787.78it/s]RDKit WARNING: [20:42:23] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:42:23] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 36%|███▌      | 1210598/3378606 [07:28<12:29, 2891.20it/s]RDKit WARNING: [20:42:27] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:42:27] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 38%|███▊      | 1297981/3378606 [07:59<12:17, 2822.96it/s]RDKit WARNING: [20:42:59] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:42:59] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 61%|██████    | 2064278/3378606 [12:47<07:58, 2748.91it/s]RDKit WARNING: [20:47:46] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:47:46] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 68%|██████▊   | 2305662/3378606 [14:15<06:23, 2800.91it/s]RDKit WARNING: [20:49:15] WARNING: not removing hydrogen atom without neighbors\n",
      "RDKit WARNING: [20:49:15] WARNING: not removing hydrogen atom without neighbors\n",
      "RDKit WARNING: [20:49:15] WARNING: not removing hydrogen atom without neighbors\n",
      " 68%|██████▊   | 2305944/3378606 [14:15<06:22, 2805.20it/s][20:49:15] WARNING: not removing hydrogen atom without neighbors\n",
      "[20:49:15] WARNING: not removing hydrogen atom without neighbors\n",
      "[20:49:15] WARNING: not removing hydrogen atom without neighbors\n",
      " 70%|███████   | 2370255/3378606 [14:39<05:44, 2928.41it/s]RDKit WARNING: [20:49:38] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:49:38] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 79%|███████▉  | 2662183/3378606 [16:28<04:05, 2923.49it/s]RDKit WARNING: [20:51:27] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:51:27] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 79%|███████▉  | 2679504/3378606 [16:34<03:58, 2932.59it/s]RDKit WARNING: [20:51:33] WARNING: not removing hydrogen atom without neighbors\n",
      "RDKit WARNING: [20:51:33] WARNING: not removing hydrogen atom without neighbors\n",
      "[20:51:33] WARNING: not removing hydrogen atom without neighbors\n",
      "[20:51:33] WARNING: not removing hydrogen atom without neighbors\n",
      " 80%|████████  | 2710088/3378606 [16:45<03:36, 3083.00it/s]RDKit WARNING: [20:51:44] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:51:44] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 92%|█████████▏| 3124086/3378606 [19:18<01:31, 2777.96it/s]RDKit WARNING: [20:54:17] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      "[20:54:17] Warning: molecule is tagged as 3D, but all Z coords are zero\n",
      " 93%|█████████▎| 3130142/3378606 [19:20<01:26, 2874.31it/s]RDKit WARNING: [20:54:19] Conflicting single bond directions around double bond at index 4.\n",
      "RDKit WARNING: [20:54:19]   BondStereo set to STEREONONE and single bond directions set to NONE.\n",
      "[20:54:19] Conflicting single bond directions around double bond at index 4.\n",
      "[20:54:19]   BondStereo set to STEREONONE and single bond directions set to NONE.\n",
      "100%|██████████| 3378606/3378606 [20:53<00:00, 2695.41it/s]\n"
     ]
    }
   ],
   "source": [
    "mols = {}\n",
    "for file in tqdm(sdf_files):\n",
    "    try:\n",
    "        idx = int(os.path.basename(os.path.splitext(file)[0]))\n",
    "        mols[idx] = sdf_to_mols(file)[0]\n",
    "    except Exception as e:\n",
    "        print(f\"Error loading file {file}\")\n",
    "        pass"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  1%|          | 32704/3378606 [00:11<03:47, 14696.78it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2774781 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  3%|▎         | 98240/3378606 [00:15<03:01, 18043.55it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3342150 SMILES=N\n",
      "Failed for molecule idx=3347837 SMILES=O\n",
      "Failed for molecule idx=3349913 SMILES=O\n",
      "Failed for molecule idx=3348987 SMILES=P\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  3%|▎         | 106432/3378606 [00:16<03:03, 17823.61it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3342147 SMILES=Br\n",
      "Failed for molecule idx=3347108 SMILES=O\n",
      "Failed for molecule idx=3346961 SMILES=C\n",
      "Failed for molecule idx=3341898 SMILES=Cl\n",
      "Failed for molecule idx=3346957 SMILES=Br\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  5%|▍         | 159680/3378606 [00:19<03:03, 17569.17it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1743406 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  5%|▍         | 163776/3378606 [00:19<02:58, 17988.87it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1749315 SMILES=P\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  6%|▌         | 192448/3378606 [00:20<02:48, 18897.95it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1646655 SMILES=S\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  6%|▌         | 196544/3378606 [00:21<02:55, 18137.11it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2278265 SMILES=Cl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  6%|▌         | 200640/3378606 [00:21<02:58, 17837.02it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2278189 SMILES=N\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  9%|▉         | 303040/3378606 [00:26<02:48, 18253.10it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1711137 SMILES=N\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  9%|▉         | 307136/3378606 [00:27<02:48, 18200.41it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1711136 SMILES=N\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|█▏        | 389056/3378606 [00:32<02:49, 17604.87it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1857468 SMILES=F\n",
      "Failed for molecule idx=1853781 SMILES=P\n",
      "Failed for molecule idx=1855419 SMILES=P\n",
      "Failed for molecule idx=1858155 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|█▏        | 393152/3378606 [00:32<02:46, 17974.53it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1857471 SMILES=N\n",
      "Failed for molecule idx=1857470 SMILES=F\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|█▏        | 397248/3378606 [00:32<02:36, 19042.59it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1857469 SMILES=F\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|█▏        | 417728/3378606 [00:33<02:43, 18142.35it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3366711 SMILES=S\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 12%|█▏        | 421824/3378606 [00:34<02:49, 17406.71it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3362848 SMILES=S\n",
      "Failed for molecule idx=3364037 SMILES=F\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 15%|█▍        | 491456/3378606 [00:38<02:44, 17503.86it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1752280 SMILES=C\n",
      "Failed for molecule idx=1755906 SMILES=Br\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 16%|█▌        | 540608/3378606 [00:40<02:39, 17827.55it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3355884 SMILES=F\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 18%|█▊        | 597952/3378606 [00:44<02:35, 17862.47it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3144495 SMILES=F\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 23%|██▎       | 774080/3378606 [00:54<02:27, 17656.45it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2904023 SMILES=S\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 24%|██▍       | 815040/3378606 [00:56<02:21, 18070.92it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=85975 SMILES=O\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 26%|██▌       | 876480/3378606 [01:00<02:24, 17264.20it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3286940 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 26%|██▌       | 880576/3378606 [01:00<02:24, 17321.76it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3286931 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 27%|██▋       | 913344/3378606 [01:02<02:21, 17450.70it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2932345 SMILES=Cl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 28%|██▊       | 937920/3378606 [01:03<02:18, 17671.06it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3159107 SMILES=[SiH4]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 37%|███▋      | 1261504/3378606 [01:22<01:52, 18857.97it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1818416 SMILES=N\n",
      "Failed for molecule idx=1812920 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 38%|███▊      | 1298368/3378606 [01:24<01:56, 17911.52it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2980688 SMILES=N\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 39%|███▊      | 1306560/3378606 [01:24<01:52, 18357.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2980855 SMILES=F\n",
      "Failed for molecule idx=2984530 SMILES=C\n",
      "Failed for molecule idx=2985105 SMILES=S\n",
      "Failed for molecule idx=2985102 SMILES=O\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 41%|████      | 1376192/3378606 [01:28<01:53, 17705.27it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2265330 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 41%|████▏     | 1396672/3378606 [01:29<01:51, 17811.62it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1736514 SMILES=P\n",
      "Failed for molecule idx=1730492 SMILES=P\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 42%|████▏     | 1404864/3378606 [01:30<01:48, 18109.16it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1736513 SMILES=P\n",
      "Failed for molecule idx=1734210 SMILES=Cl\n",
      "Failed for molecule idx=1734544 SMILES=[NH]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 46%|████▌     | 1560512/3378606 [01:39<01:49, 16625.65it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3117773 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 57%|█████▋    | 1941440/3378606 [02:01<02:14, 10660.95it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1827685 SMILES=Cl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 63%|██████▎   | 2138048/3378606 [02:12<01:09, 17880.13it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3175805 SMILES=N\n",
      "Failed for molecule idx=3176384 SMILES=C\n",
      "Failed for molecule idx=3176385 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 63%|██████▎   | 2142144/3378606 [02:12<01:08, 18141.98it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3176386 SMILES=C\n",
      "Failed for molecule idx=3176383 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 65%|██████▍   | 2191296/3378606 [02:15<01:02, 18859.25it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=137779 SMILES=P\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 65%|██████▌   | 2211776/3378606 [02:16<01:00, 19136.32it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3254373 SMILES=[BH]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 70%|███████   | 2379712/3378606 [02:26<00:55, 18099.20it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2991141 SMILES=Cl\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 77%|███████▋  | 2592704/3378606 [02:38<00:47, 16497.95it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3208582 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 78%|███████▊  | 2645952/3378606 [02:41<00:39, 18756.99it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3274341 SMILES=C\n",
      "Failed for molecule idx=3274651 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 80%|███████▉  | 2691008/3378606 [02:43<00:36, 19010.78it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1768820 SMILES=P\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 84%|████████▍ | 2846656/3378606 [02:53<00:31, 16902.85it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1662286 SMILES=C\n",
      "Failed for molecule idx=1662363 SMILES=P\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 84%|████████▍ | 2854848/3378606 [02:53<00:29, 17834.56it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1665176 SMILES=S\n",
      "Failed for molecule idx=1664201 SMILES=C\n",
      "Failed for molecule idx=1664472 SMILES=C\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 85%|████████▍ | 2858944/3378606 [02:53<00:28, 18168.65it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1665177 SMILES=S\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 87%|████████▋ | 2944960/3378606 [02:58<00:22, 19224.93it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=3138019 SMILES=O\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 88%|████████▊ | 2961344/3378606 [02:59<00:22, 18372.98it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1773339 SMILES=[BH]\n",
      "Failed for molecule idx=1773342 SMILES=[NH]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 92%|█████████▏| 3116992/3378606 [03:08<00:15, 17243.38it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2627608 SMILES=O\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 93%|█████████▎| 3155694/3378606 [03:10<00:22, 9743.05it/s] "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=2633315 SMILES=Br\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 95%|█████████▌| 3219392/3378606 [03:14<00:08, 18201.75it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1693988 SMILES=N\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 97%|█████████▋| 3276736/3378606 [03:17<00:05, 18202.14it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1838508 SMILES=N\n",
      "Failed for molecule idx=1831559 SMILES=P\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 97%|█████████▋| 3289024/3378606 [03:18<00:04, 18092.59it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Failed for molecule idx=1841135 SMILES=S\n",
      "Failed for molecule idx=1849751 SMILES=[NH]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 3378606/3378606 [03:23<00:00, 16637.40it/s]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>I_a</th>\n",
       "      <th>I_b</th>\n",
       "      <th>I_c</th>\n",
       "      <th>IH_a</th>\n",
       "      <th>IH_b</th>\n",
       "      <th>IH_c</th>\n",
       "      <th>len_a</th>\n",
       "      <th>len_b</th>\n",
       "      <th>len_c</th>\n",
       "      <th>idx</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.801022</td>\n",
       "      <td>1.205763</td>\n",
       "      <td>0.703477</td>\n",
       "      <td>3.332717</td>\n",
       "      <td>2.230740</td>\n",
       "      <td>1.277720</td>\n",
       "      <td>0.785694</td>\n",
       "      <td>0.564729</td>\n",
       "      <td>0.180997</td>\n",
       "      <td>252506</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.848249</td>\n",
       "      <td>0.668207</td>\n",
       "      <td>0.180042</td>\n",
       "      <td>1.054542</td>\n",
       "      <td>0.906995</td>\n",
       "      <td>0.147547</td>\n",
       "      <td>0.781616</td>\n",
       "      <td>0.382377</td>\n",
       "      <td>0.000281</td>\n",
       "      <td>253520</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.723672</td>\n",
       "      <td>1.429719</td>\n",
       "      <td>0.451579</td>\n",
       "      <td>2.734154</td>\n",
       "      <td>2.413071</td>\n",
       "      <td>0.522670</td>\n",
       "      <td>0.860926</td>\n",
       "      <td>0.523788</td>\n",
       "      <td>0.239813</td>\n",
       "      <td>253300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.479145</td>\n",
       "      <td>1.422294</td>\n",
       "      <td>0.184963</td>\n",
       "      <td>2.109702</td>\n",
       "      <td>2.044756</td>\n",
       "      <td>0.159232</td>\n",
       "      <td>0.976637</td>\n",
       "      <td>0.314264</td>\n",
       "      <td>0.220982</td>\n",
       "      <td>259104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.799458</td>\n",
       "      <td>0.749364</td>\n",
       "      <td>0.148133</td>\n",
       "      <td>0.601163</td>\n",
       "      <td>0.524731</td>\n",
       "      <td>0.162983</td>\n",
       "      <td>0.673110</td>\n",
       "      <td>0.281897</td>\n",
       "      <td>0.213451</td>\n",
       "      <td>256366</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378521</th>\n",
       "      <td>2.039659</td>\n",
       "      <td>2.003402</td>\n",
       "      <td>0.355868</td>\n",
       "      <td>1.830906</td>\n",
       "      <td>1.700403</td>\n",
       "      <td>0.312575</td>\n",
       "      <td>1.006012</td>\n",
       "      <td>0.353471</td>\n",
       "      <td>0.361731</td>\n",
       "      <td>1175534</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378522</th>\n",
       "      <td>2.090909</td>\n",
       "      <td>1.976536</td>\n",
       "      <td>0.304126</td>\n",
       "      <td>4.400455</td>\n",
       "      <td>4.142253</td>\n",
       "      <td>0.713854</td>\n",
       "      <td>1.070125</td>\n",
       "      <td>0.503605</td>\n",
       "      <td>0.283449</td>\n",
       "      <td>1173024</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378523</th>\n",
       "      <td>1.033674</td>\n",
       "      <td>0.909330</td>\n",
       "      <td>0.271496</td>\n",
       "      <td>1.491752</td>\n",
       "      <td>1.324474</td>\n",
       "      <td>0.221520</td>\n",
       "      <td>0.769898</td>\n",
       "      <td>0.373006</td>\n",
       "      <td>0.267498</td>\n",
       "      <td>1177412</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378524</th>\n",
       "      <td>1.218129</td>\n",
       "      <td>1.101899</td>\n",
       "      <td>0.249071</td>\n",
       "      <td>1.317298</td>\n",
       "      <td>1.287553</td>\n",
       "      <td>0.284384</td>\n",
       "      <td>0.800118</td>\n",
       "      <td>0.361007</td>\n",
       "      <td>0.249172</td>\n",
       "      <td>1178625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378525</th>\n",
       "      <td>1.658228</td>\n",
       "      <td>1.597780</td>\n",
       "      <td>0.208511</td>\n",
       "      <td>2.875891</td>\n",
       "      <td>2.799357</td>\n",
       "      <td>0.224468</td>\n",
       "      <td>1.029655</td>\n",
       "      <td>0.277447</td>\n",
       "      <td>0.248887</td>\n",
       "      <td>1179061</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3378526 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              I_a       I_b       I_c      IH_a      IH_b      IH_c     len_a  \\\n",
       "0        1.801022  1.205763  0.703477  3.332717  2.230740  1.277720  0.785694   \n",
       "1        0.848249  0.668207  0.180042  1.054542  0.906995  0.147547  0.781616   \n",
       "2        1.723672  1.429719  0.451579  2.734154  2.413071  0.522670  0.860926   \n",
       "3        1.479145  1.422294  0.184963  2.109702  2.044756  0.159232  0.976637   \n",
       "4        0.799458  0.749364  0.148133  0.601163  0.524731  0.162983  0.673110   \n",
       "...           ...       ...       ...       ...       ...       ...       ...   \n",
       "3378521  2.039659  2.003402  0.355868  1.830906  1.700403  0.312575  1.006012   \n",
       "3378522  2.090909  1.976536  0.304126  4.400455  4.142253  0.713854  1.070125   \n",
       "3378523  1.033674  0.909330  0.271496  1.491752  1.324474  0.221520  0.769898   \n",
       "3378524  1.218129  1.101899  0.249071  1.317298  1.287553  0.284384  0.800118   \n",
       "3378525  1.658228  1.597780  0.208511  2.875891  2.799357  0.224468  1.029655   \n",
       "\n",
       "            len_b     len_c      idx  \n",
       "0        0.564729  0.180997   252506  \n",
       "1        0.382377  0.000281   253520  \n",
       "2        0.523788  0.239813   253300  \n",
       "3        0.314264  0.220982   259104  \n",
       "4        0.281897  0.213451   256366  \n",
       "...           ...       ...      ...  \n",
       "3378521  0.353471  0.361731  1175534  \n",
       "3378522  0.503605  0.283449  1173024  \n",
       "3378523  0.373006  0.267498  1177412  \n",
       "3378524  0.361007  0.249172  1178625  \n",
       "3378525  0.277447  0.248887  1179061  \n",
       "\n",
       "[3378526 rows x 10 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Run generate_physics_dict_structure on every (idx, mol) pair in parallel.\n",
    "# n_jobs=-1 lets joblib use all available CPUs; pass n_jobs=1 to run\n",
    "# sequentially when debugging a failing molecule.\n",
    "list_struct = Parallel(n_jobs=-1)(\n",
    "    delayed(generate_physics_dict_structure)(mol, idx) for idx, mol in tqdm(mols.items())\n",
    ")\n",
    "\n",
    "# Some results may be None; drop them once up front so the column names are\n",
    "# never read from a None entry (previously list_struct[0] was used, which\n",
    "# crashed whenever the very first result was None).\n",
    "valid_structs = [dic for dic in list_struct if dic is not None]\n",
    "if not valid_structs:\n",
    "    raise ValueError(\"generate_physics_dict_structure produced no usable result for any molecule\")\n",
    "\n",
    "# Transpose the list of per-molecule dicts into one list of values per key,\n",
    "# using the keys of the first valid entry as the column set.\n",
    "dict_struct = {k: [dic[k] for dic in valid_structs] for k in valid_structs[0]}\n",
    "df = pd.DataFrame(dict_struct)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>smiles</th>\n",
       "      <th>homolumogap</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C</td>\n",
       "      <td>3.047675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>COc1cc(OC)ccc1/C=C/N(C(=O)C)C</td>\n",
       "      <td>4.410966</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>C=CCN(C(=O)C)/C=C/c1ccccc1C</td>\n",
       "      <td>4.639541</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>C=CCN(C(=O)C)/C=C/c1ccccc1F</td>\n",
       "      <td>4.492600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>C=CCN(C(=O)C)/C=C/c1ccccc1Cl</td>\n",
       "      <td>4.612330</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3746615</th>\n",
       "      <td>3746615</td>\n",
       "      <td>CCn1cnc2c1ncnc2N</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3746616</th>\n",
       "      <td>3746616</td>\n",
       "      <td>O=N(=O)c1ccc(c(c1)N(=O)=O)Cl</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3746617</th>\n",
       "      <td>3746617</td>\n",
       "      <td>NCC(=O)COP(=O)(O)O</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3746618</th>\n",
       "      <td>3746618</td>\n",
       "      <td>C[C@@H](CN)O</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3746619</th>\n",
       "      <td>3746619</td>\n",
       "      <td>O[C@H]1C=CC=C([C@@H]1O)C(=O)O</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3746620 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             idx                                 smiles  homolumogap\n",
       "0              0  O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C     3.047675\n",
       "1              1          COc1cc(OC)ccc1/C=C/N(C(=O)C)C     4.410966\n",
       "2              2            C=CCN(C(=O)C)/C=C/c1ccccc1C     4.639541\n",
       "3              3            C=CCN(C(=O)C)/C=C/c1ccccc1F     4.492600\n",
       "4              4           C=CCN(C(=O)C)/C=C/c1ccccc1Cl     4.612330\n",
       "...          ...                                    ...          ...\n",
       "3746615  3746615                       CCn1cnc2c1ncnc2N          NaN\n",
       "3746616  3746616           O=N(=O)c1ccc(c(c1)N(=O)=O)Cl          NaN\n",
       "3746617  3746617                     NCC(=O)COP(=O)(O)O          NaN\n",
       "3746618  3746618                           C[C@@H](CN)O          NaN\n",
       "3746619  3746619          O[C@H]1C=CC=C([C@@H]1O)C(=O)O          NaN\n",
       "\n",
       "[3746620 rows x 3 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the raw dataset table located at RAW_CSV (columns: idx, smiles, homolumogap).\n",
    "raw_df = pd.read_csv(filepath_or_buffer=RAW_CSV)\n",
    "raw_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>idx</th>\n",
       "      <th>smiles</th>\n",
       "      <th>homolumogap</th>\n",
       "      <th>I_a</th>\n",
       "      <th>I_b</th>\n",
       "      <th>I_c</th>\n",
       "      <th>IH_a</th>\n",
       "      <th>IH_b</th>\n",
       "      <th>IH_c</th>\n",
       "      <th>len_a</th>\n",
       "      <th>len_b</th>\n",
       "      <th>len_c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C</td>\n",
       "      <td>3.047675</td>\n",
       "      <td>1.813462</td>\n",
       "      <td>1.601682</td>\n",
       "      <td>0.401664</td>\n",
       "      <td>1.510950</td>\n",
       "      <td>1.436966</td>\n",
       "      <td>0.173702</td>\n",
       "      <td>0.961845</td>\n",
       "      <td>0.452937</td>\n",
       "      <td>0.254786</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>COc1cc(OC)ccc1/C=C/N(C(=O)C)C</td>\n",
       "      <td>4.410966</td>\n",
       "      <td>2.308566</td>\n",
       "      <td>1.932109</td>\n",
       "      <td>0.403518</td>\n",
       "      <td>2.688305</td>\n",
       "      <td>1.975483</td>\n",
       "      <td>0.773145</td>\n",
       "      <td>1.053264</td>\n",
       "      <td>0.561198</td>\n",
       "      <td>0.126349</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>C=CCN(C(=O)C)/C=C/c1ccccc1C</td>\n",
       "      <td>4.639541</td>\n",
       "      <td>1.691103</td>\n",
       "      <td>1.338676</td>\n",
       "      <td>0.398564</td>\n",
       "      <td>1.848389</td>\n",
       "      <td>1.220168</td>\n",
       "      <td>0.714471</td>\n",
       "      <td>0.869384</td>\n",
       "      <td>0.583278</td>\n",
       "      <td>0.161675</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>C=CCN(C(=O)C)/C=C/c1ccccc1F</td>\n",
       "      <td>4.492600</td>\n",
       "      <td>1.753497</td>\n",
       "      <td>1.367217</td>\n",
       "      <td>0.411020</td>\n",
       "      <td>1.552199</td>\n",
       "      <td>1.053055</td>\n",
       "      <td>0.540122</td>\n",
       "      <td>0.875695</td>\n",
       "      <td>0.588322</td>\n",
       "      <td>0.126418</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>C=CCN(C(=O)C)/C=C/c1ccccc1Cl</td>\n",
       "      <td>4.612330</td>\n",
       "      <td>1.872057</td>\n",
       "      <td>1.467153</td>\n",
       "      <td>0.527033</td>\n",
       "      <td>1.661891</td>\n",
       "      <td>1.365148</td>\n",
       "      <td>0.350127</td>\n",
       "      <td>0.891042</td>\n",
       "      <td>0.562013</td>\n",
       "      <td>0.210463</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378521</th>\n",
       "      <td>3378601</td>\n",
       "      <td>Cc1ccc(c(c1)C)N[C@H](/C(=N\\C1CC1)/O)C</td>\n",
       "      <td>5.347037</td>\n",
       "      <td>1.348275</td>\n",
       "      <td>1.288381</td>\n",
       "      <td>0.529860</td>\n",
       "      <td>2.074233</td>\n",
       "      <td>1.926117</td>\n",
       "      <td>0.849037</td>\n",
       "      <td>0.828846</td>\n",
       "      <td>0.462148</td>\n",
       "      <td>0.401456</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378522</th>\n",
       "      <td>3378602</td>\n",
       "      <td>C[C@@H](/C(=N\\C1CC1)/O)Nc1cccc(c1C)C</td>\n",
       "      <td>5.809631</td>\n",
       "      <td>1.667726</td>\n",
       "      <td>1.532551</td>\n",
       "      <td>0.408586</td>\n",
       "      <td>2.339997</td>\n",
       "      <td>1.954943</td>\n",
       "      <td>0.658524</td>\n",
       "      <td>0.886473</td>\n",
       "      <td>0.473849</td>\n",
       "      <td>0.327503</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378523</th>\n",
       "      <td>3378603</td>\n",
       "      <td>C[C@H](/C(=N\\C(=N)O)/O)Nc1cccc(c1C)C</td>\n",
       "      <td>5.064039</td>\n",
       "      <td>1.450947</td>\n",
       "      <td>1.335399</td>\n",
       "      <td>0.385102</td>\n",
       "      <td>1.659290</td>\n",
       "      <td>1.385724</td>\n",
       "      <td>0.449171</td>\n",
       "      <td>0.919293</td>\n",
       "      <td>0.435652</td>\n",
       "      <td>0.331783</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378524</th>\n",
       "      <td>3378604</td>\n",
       "      <td>C[C@@H](/C(=N\\C(=N)O)/O)Nc1cccc(c1C)C</td>\n",
       "      <td>5.336153</td>\n",
       "      <td>1.459332</td>\n",
       "      <td>1.360086</td>\n",
       "      <td>0.370713</td>\n",
       "      <td>1.663606</td>\n",
       "      <td>1.398784</td>\n",
       "      <td>0.437598</td>\n",
       "      <td>0.925686</td>\n",
       "      <td>0.434268</td>\n",
       "      <td>0.347544</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3378525</th>\n",
       "      <td>3378605</td>\n",
       "      <td>CCOc1ccccc1NC/C(=N\\C1CC1)/O</td>\n",
       "      <td>5.420508</td>\n",
       "      <td>1.699425</td>\n",
       "      <td>1.473906</td>\n",
       "      <td>0.427350</td>\n",
       "      <td>2.540095</td>\n",
       "      <td>2.228266</td>\n",
       "      <td>0.472735</td>\n",
       "      <td>0.955590</td>\n",
       "      <td>0.485876</td>\n",
       "      <td>0.306142</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3378526 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             idx                                 smiles  homolumogap  \\\n",
       "0              0  O=C1[N]c2ccncc2[CH][C@@H]1c1ccc(cc1)C     3.047675   \n",
       "1              1          COc1cc(OC)ccc1/C=C/N(C(=O)C)C     4.410966   \n",
       "2              2            C=CCN(C(=O)C)/C=C/c1ccccc1C     4.639541   \n",
       "3              3            C=CCN(C(=O)C)/C=C/c1ccccc1F     4.492600   \n",
       "4              4           C=CCN(C(=O)C)/C=C/c1ccccc1Cl     4.612330   \n",
       "...          ...                                    ...          ...   \n",
       "3378521  3378601  Cc1ccc(c(c1)C)N[C@H](/C(=N\\C1CC1)/O)C     5.347037   \n",
       "3378522  3378602   C[C@@H](/C(=N\\C1CC1)/O)Nc1cccc(c1C)C     5.809631   \n",
       "3378523  3378603   C[C@H](/C(=N\\C(=N)O)/O)Nc1cccc(c1C)C     5.064039   \n",
       "3378524  3378604  C[C@@H](/C(=N\\C(=N)O)/O)Nc1cccc(c1C)C     5.336153   \n",
       "3378525  3378605            CCOc1ccccc1NC/C(=N\\C1CC1)/O     5.420508   \n",
       "\n",
       "              I_a       I_b       I_c      IH_a      IH_b      IH_c     len_a  \\\n",
       "0        1.813462  1.601682  0.401664  1.510950  1.436966  0.173702  0.961845   \n",
       "1        2.308566  1.932109  0.403518  2.688305  1.975483  0.773145  1.053264   \n",
       "2        1.691103  1.338676  0.398564  1.848389  1.220168  0.714471  0.869384   \n",
       "3        1.753497  1.367217  0.411020  1.552199  1.053055  0.540122  0.875695   \n",
       "4        1.872057  1.467153  0.527033  1.661891  1.365148  0.350127  0.891042   \n",
       "...           ...       ...       ...       ...       ...       ...       ...   \n",
       "3378521  1.348275  1.288381  0.529860  2.074233  1.926117  0.849037  0.828846   \n",
       "3378522  1.667726  1.532551  0.408586  2.339997  1.954943  0.658524  0.886473   \n",
       "3378523  1.450947  1.335399  0.385102  1.659290  1.385724  0.449171  0.919293   \n",
       "3378524  1.459332  1.360086  0.370713  1.663606  1.398784  0.437598  0.925686   \n",
       "3378525  1.699425  1.473906  0.427350  2.540095  2.228266  0.472735  0.955590   \n",
       "\n",
       "            len_b     len_c  \n",
       "0        0.452937  0.254786  \n",
       "1        0.561198  0.126349  \n",
       "2        0.583278  0.161675  \n",
       "3        0.588322  0.126418  \n",
       "4        0.562013  0.210463  \n",
       "...           ...       ...  \n",
       "3378521  0.462148  0.401456  \n",
       "3378522  0.473849  0.327503  \n",
       "3378523  0.435652  0.331783  \n",
       "3378524  0.434268  0.347544  \n",
       "3378525  0.485876  0.306142  \n",
       "\n",
       "[3378526 rows x 12 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inner join on idx: keep only the raw rows that also have a row in df.\n",
    "merged_df = pd.merge(raw_df, df, how=\"inner\", on=\"idx\")\n",
    "merged_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Destination file under ROOT; micro_name is appended to the base file name.\n",
    "csv_path = os.path.join(ROOT, f\"pcqm4m-v2-physics{micro_name}.csv.gz\")\n",
    "\n",
    "# Order the rows by idx, then write them out without the DataFrame index column.\n",
    "merged_df = merged_df.sort_values(by=\"idx\")\n",
    "merged_df.to_csv(csv_path, index=False)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "906fb3602c4d49c4c01ec4a34d5abe4c7af3a32cf71a93ee4914804593d051b0"
  },
  "kernelspec": {
   "display_name": "Python 3.9.9 ('gtblueprint')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
