{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import json\n",
    "\n",
    "\n",
    "def _read_json(path):\n",
    "    \"\"\"Parse the JSON file at `path` and hand back the decoded object.\"\"\"\n",
    "    with open(path, 'r') as handle:\n",
    "        return json.load(handle)\n",
    "\n",
    "\n",
    "# Collection of compound IDs to process.\n",
    "# NOTE: unpickling can execute arbitrary code, so cid_set.pkl must come from\n",
    "# a trusted source.\n",
    "with open('./cid_set.pkl', 'rb') as handle:\n",
    "    cid_set = pickle.load(handle)\n",
    "\n",
    "# Neighbour tables read from the trailing-underscore files; the fetch loop\n",
    "# later in this notebook copies entries from them instead of re-querying.\n",
    "chemical_nbr2d = _read_json('./chemical_nbr2d_.json')\n",
    "chemical_nbr3d = _read_json('./chemical_nbr3d_.json')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "62174"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show how many entries cid_set holds, i.e. how many iterations the fetch loop will run.\n",
    "len(cid_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 62174/62174 [49:53<00:00, 20.77it/s]  \n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "import re\n",
    "from collections import defaultdict\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Tunables for the crawl.\n",
    "TOP_K = 3                 # neighbours kept per compound and similarity type\n",
    "CHECKPOINT_EVERY = 10000  # dump partial results every N compounds\n",
    "REQUEST_TIMEOUT = 30      # seconds before a stalled request is abandoned\n",
    "\n",
    "# Each pattern captures the first five CID numbers that follow the section\n",
    "# label. Compiled once here because they do not change between iterations.\n",
    "# NOTE(review): a page listing fewer than five CIDs after the label will not\n",
    "# match at all, and the lazy `.*?` (with DOTALL) is free to run past the end\n",
    "# of its own section - confirm against the actual PubChem markup.\n",
    "PATTERN_2D = re.compile(\n",
    "    r'similar to by PubChem 2D similarity algorithm.*?CID(\\d+).*?CID(\\d+).*?CID(\\d+).*?CID(\\d+).*?CID(\\d+)',\n",
    "    flags=re.DOTALL,\n",
    ")\n",
    "PATTERN_3D = re.compile(\n",
    "    r'similar to by PubChem 3D similarity algorithm.*?CID(\\d+).*?CID(\\d+).*?CID(\\d+).*?CID(\\d+).*?CID(\\d+)',\n",
    "    flags=re.DOTALL,\n",
    ")\n",
    "\n",
    "\n",
    "def _top_neighbors(html, pattern, self_cid, limit=TOP_K):\n",
    "    \"\"\"Return up to `limit` distinct CIDs captured by `pattern` in `html`.\n",
    "\n",
    "    Captured CIDs equal to `self_cid` are skipped and repeats are dropped.\n",
    "    Returns None when `pattern` does not match, so the caller can leave the\n",
    "    compound without an entry (same as the no-match case before).\n",
    "    \"\"\"\n",
    "    match = pattern.search(html)\n",
    "    if match is None:\n",
    "        return None\n",
    "    picked = []\n",
    "    for c in match.groups():\n",
    "        if c != self_cid and c not in picked:\n",
    "            picked.append(c)\n",
    "        if len(picked) == limit:\n",
    "            break\n",
    "    return picked\n",
    "\n",
    "\n",
    "def _save_tables():\n",
    "    \"\"\"Write the neighbour tables collected so far to their JSON files.\"\"\"\n",
    "    with open('./chemical_nbr2d.json', 'w') as f:\n",
    "        json.dump(nbr_2d, f, indent=6)\n",
    "    with open('./chemical_nbr3d.json', 'w') as f:\n",
    "        json.dump(nbr_3d, f, indent=6)\n",
    "\n",
    "\n",
    "nbr_2d = defaultdict(list)\n",
    "nbr_3d = defaultdict(list)\n",
    "failed_cids = []  # compounds whose neighbour page could not be fetched\n",
    "\n",
    "for i, cid in enumerate(tqdm(cid_set), start=1):\n",
    "    # Periodic checkpoint so a crash late in the run does not lose everything.\n",
    "    if i % CHECKPOINT_EVERY == 0:\n",
    "        _save_tables()\n",
    "\n",
    "    # JSON object keys and regex captures are always strings, so use the\n",
    "    # string form of the CID for lookups, storage and comparison.\n",
    "    # NOTE(review): the regex captures digits only; if cid_set entries carry a\n",
    "    # 'CID' prefix the self-exclusion below never fires - confirm.\n",
    "    key = str(cid)\n",
    "\n",
    "    # Reuse previously computed results where available.\n",
    "    if key in chemical_nbr2d:\n",
    "        nbr_2d[key] = chemical_nbr2d[key]\n",
    "    if key in chemical_nbr3d:\n",
    "        nbr_3d[key] = chemical_nbr3d[key]\n",
    "\n",
    "    need_2d = key not in nbr_2d\n",
    "    need_3d = key not in nbr_3d\n",
    "    if not (need_2d or need_3d):\n",
    "        continue\n",
    "\n",
    "    # Define the API endpoint URL\n",
    "    url = f\"https://pubchem.ncbi.nlm.nih.gov/rest/rdf/compound/{key}/nbr.html\"\n",
    "\n",
    "    # Query the API; one bad request must not abort the whole crawl, so\n",
    "    # failures are recorded and the loop moves on.\n",
    "    try:\n",
    "        response = requests.get(url, timeout=REQUEST_TIMEOUT)\n",
    "        response.raise_for_status()\n",
    "    except requests.RequestException:\n",
    "        failed_cids.append(key)\n",
    "        continue\n",
    "    page = response.text\n",
    "\n",
    "    # Fill in only the similarity type that is actually missing, so a cached\n",
    "    # list is never extended with freshly scraped CIDs.\n",
    "    if need_2d:\n",
    "        found_2d = _top_neighbors(page, PATTERN_2D, key)\n",
    "        if found_2d is not None:\n",
    "            nbr_2d[key] = found_2d\n",
    "    if need_3d:\n",
    "        found_3d = _top_neighbors(page, PATTERN_3D, key)\n",
    "        if found_3d is not None:\n",
    "            nbr_3d[key] = found_3d\n",
    "\n",
    "if failed_cids:\n",
    "    print(f'{len(failed_cids)} compound pages could not be fetched; see failed_cids')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Final dump of both neighbour tables, 2D first and then 3D, to the same\n",
    "# files the fetch loop uses for its periodic checkpoints.\n",
    "for out_path, table in (\n",
    "    ('./chemical_nbr2d.json', nbr_2d),\n",
    "    ('./chemical_nbr3d.json', nbr_3d),\n",
    "):\n",
    "    with open(out_path, 'w') as sink:\n",
    "        json.dump(table, sink, indent=6)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.16 ('txgnn_env')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "79cb95e61c4f960f4e102f21c45668d32cb5c494b237694c15d64b50342e6e99"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
