{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "def save(triple_set, file_):\n",
    "    \"\"\"Write (head, relation, tail) triples to a tab-separated text file.\n",
    "\n",
    "    Each complete triple becomes one line ``h<TAB>r<TAB>t``; a triple with any\n",
    "    ``None`` component is skipped. The target file is overwritten.\n",
    "\n",
    "    Args:\n",
    "        triple_set: Iterable of 3-tuples of strings (``None`` marks a missing\n",
    "            component).\n",
    "        file_: Path of the output file; opened in text mode with ``'w'``.\n",
    "    \"\"\"\n",
    "    # Build every line first and join once, instead of growing one string\n",
    "    # with ``+=`` inside the loop.\n",
    "    rows = [\n",
    "        '\\t'.join((h, r, t)) + '\\n'\n",
    "        for h, r, t in triple_set\n",
    "        if h is not None and r is not None and t is not None\n",
    "    ]\n",
    "\n",
    "    with open(file_, 'w') as f:\n",
    "        f.write(''.join(rows))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      " 26%|██▌       | 36025/140000 [2:11:55<4:33:07,  6.34it/s]   "
     ]
    }
   ],
   "source": [
    "import requests\n",
    "from tqdm import tqdm\n",
    "\n",
    "import re\n",
    "\n",
    "def retrive_value(response):\n",
    "    \"\"\"Pull the first numeric ``value`` span payload out of an HTML string.\n",
    "\n",
    "    Args:\n",
    "        response: HTML document text to search.\n",
    "\n",
    "    Returns:\n",
    "        The captured text (a run of digits and dots) as a string, or ``None``\n",
    "        when no matching span is present.\n",
    "    \"\"\"\n",
    "    # NOTE(review): the character class only admits digits and '.', so a\n",
    "    # signed or exponent-form value will not match and yields None.\n",
    "    found = re.search(r'<span class=\"value\">([\\d.]+)</span>', response)\n",
    "    return found.group(1) if found else None\n",
    "\n",
    "# Slice of the input file handled by this run, and where its triples go.\n",
    "CHUNK_START, CHUNK_END = 980000, 1120000\n",
    "OUT_FILE = '../compound_descriptor_triple_8.txt'\n",
    "SAVE_EVERY = 10000      # checkpoint interval, in processed lines\n",
    "REQUEST_TIMEOUT = 30    # seconds; keeps one stalled request from hanging the run\n",
    "\n",
    "with open('../compound_descriptor.txt', 'r') as f:\n",
    "    lines = f.readlines()\n",
    "\n",
    "# Iterate over the slice itself: looping over range(len(slice)) and reading\n",
    "# lines[i] would walk lines 0..N-1 of the file instead of CHUNK_START..CHUNK_END.\n",
    "chunk = lines[CHUNK_START:CHUNK_END]\n",
    "\n",
    "rel_set = set()\n",
    "triple_set = set()\n",
    "\n",
    "# A single Session reuses the HTTP connection across the many requests.\n",
    "with requests.Session() as session:\n",
    "    for i, line in enumerate(tqdm(chunk)):\n",
    "        fields = line.rstrip('\\n').split('\\t')\n",
    "        if len(fields) != 3:\n",
    "            # Skip blank/malformed lines rather than aborting a multi-hour run.\n",
    "            continue\n",
    "        h, _, t = fields\n",
    "\n",
    "        t_query = t.replace('descriptor:', '')\n",
    "        # Drop only the leading '<id>_' prefix; keep the name unchanged when\n",
    "        # it contains no underscore.\n",
    "        _prefix, sep, rest = t_query.partition('_')\n",
    "        rel = rest if sep else t_query\n",
    "        rel_set.add(rel)\n",
    "\n",
    "        url = f\"https://pubchem.ncbi.nlm.nih.gov/rest/rdf/descriptor/{t_query}.html\"\n",
    "\n",
    "        # Query the API and parse the response\n",
    "        try:\n",
    "            response = session.get(url, timeout=REQUEST_TIMEOUT).text\n",
    "        except requests.RequestException:\n",
    "            # Best effort: skip this line on a network failure, but let\n",
    "            # KeyboardInterrupt and programming errors propagate.\n",
    "            continue\n",
    "\n",
    "        value = retrive_value(response)\n",
    "        # save() drops triples with a None component, so do not store them.\n",
    "        if value is not None:\n",
    "            triple_set.add((h, rel, value))\n",
    "\n",
    "        if i % SAVE_EVERY == 0 and i != 0:\n",
    "            save(triple_set=triple_set, file_=OUT_FILE)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Final write-out of the collected triples; the crawl loop only checkpoints periodically.\n",
    "save(\n",
    "    triple_set=triple_set,\n",
    "    file_='../compound_descriptor_triple_8.txt',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.16 ('txgnn_env')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "79cb95e61c4f960f4e102f21c45668d32cb5c494b237694c15d64b50342e6e99"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
