{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "from rdflib import Graph, Namespace, Literal\n",
    "from rdflib.plugins.sparql import prepareQuery\n",
    "import glob\n",
    "import gzip\n",
    "from tqdm import tqdm\n",
    "\n",
    "dir_ = '/data/pj20/compound/general/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('./cid_set.pkl', 'rb') as f:\n",
    "    cid_set = pickle.load(f)\n",
    "\n",
    "# import json\n",
    "\n",
    "# smile2cid = {}\n",
    "# with open('./smile2cid.json', 'r') as f:\n",
    "#     smile2cid.update(json.load(f))\n",
    "\n",
    "# icd2smile = {value: key for key, value in smile2cid.items() if value != 'None' and value != None}\n",
    "# part_cid = set()\n",
    "# for i in tqdm(icd2smile.keys()):\n",
    "#     part_cid.add(str(i))\n",
    "\n",
    "# cid_set.update(part_cid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "62174"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(cid_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "from collections import defaultdict\n",
    "\n",
    "def retrieve(triple_strs, relation):\n",
    "    triple_set = set()\n",
    "    t_buffer = defaultdict(list)\n",
    "    h_buffer = defaultdict(list)\n",
    "    for triple_str in tqdm(triple_strs):\n",
    "        if ' ,\\n\\t\\t' in triple_str:\n",
    "            items = triple_str.split(' ,\\n\\t\\t')\n",
    "            hr_t0 = items[0]\n",
    "            h, r, t0 = hr_t0.split('\\t')\n",
    "            h = h.replace('compound:CID', '')\n",
    "            t0 = t0.replace('compound:CID', '')\n",
    "\n",
    "            if h in cid_set and t0 in cid_set:\n",
    "                triple_set.add((h, relation,t0))\n",
    "\n",
    "            elif h in cid_set and t0 not in cid_set:\n",
    "                t_buffer[t0].append(h)\n",
    "            \n",
    "            elif h not in cid_set and t0 in cid_set:\n",
    "                h_buffer[h].append(t0)\n",
    "\n",
    "\n",
    "            tail_num = len(items) - 1\n",
    "            for tail in range(tail_num):\n",
    "                t = items[tail+1].replace('compound:CID', '')\n",
    "                if h in cid_set and t in cid_set:\n",
    "                    triple_set.add((h, relation, t))\n",
    "\n",
    "                elif h in cid_set and t not in cid_set:\n",
    "                    t_buffer[t].append(h)\n",
    "                \n",
    "                elif h not in cid_set and t in cid_set:\n",
    "                    h_buffer[h].append(t)\n",
    "        else:\n",
    "            try:\n",
    "                items = triple_str.split('\\t')\n",
    "                h = items[0].replace('compound:CID', '')\n",
    "                t = items[2].replace('compound:CID', '')\n",
    "                if h in cid_set and t in cid_set:\n",
    "                    triple_set.add((h, relation, t))\n",
    "\n",
    "                elif h in cid_set and t not in cid_set:\n",
    "                    t_buffer[t].append(h)\n",
    "                \n",
    "                elif h not in cid_set and t in cid_set:\n",
    "                    h_buffer[h].append(t)\n",
    "            except:\n",
    "                continue\n",
    "                \n",
    "    return triple_set, h_buffer, t_buffer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2component ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound2component.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 23769818/23769818 [00:40<00:00, 583799.01it/s] \n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"has_component\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "511 27341\n"
     ]
    }
   ],
   "source": [
    "cnt = 0\n",
    "\n",
    "for key in h_buffer.keys():\n",
    "    if len(h_buffer[key]) > 30:\n",
    "        cnt+=1\n",
    "        for t in h_buffer[key]:\n",
    "            triple_set.add((key, 'has_component', t))\n",
    "print(cnt, len(triple_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_component.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2parent ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound2parent.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 6943268/6943268 [00:05<00:00, 1239920.57it/s]\n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"has_parent\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "44 5223\n"
     ]
    }
   ],
   "source": [
    "cnt = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "    if len(t_buffer[key]) > 2:\n",
    "        cnt+=1\n",
    "        for h in t_buffer[key]:\n",
    "            triple_set.add((h, 'has_parent', key))\n",
    "print(cnt, len(triple_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_parent.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2stereoisomer ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound2stereoisomer_000001.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs_1 = lines.split(' .\\n')[2:]\n",
    "\n",
    "with open(dir_ + 'pc_compound2stereoisomer_000002.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs_2 = lines.split(' .\\n')[2:]\n",
    "\n",
    "with open(dir_ + 'pc_compound2stereoisomer_000003.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs_3 = lines.split(' .\\n')[2:]\n",
    "\n",
    "triple_strs = triple_strs_1 + triple_strs_2 + triple_strs_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 24904630/24904630 [00:38<00:00, 649312.21it/s] \n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"has_stereoisomer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1229 1229 59370 63403\n"
     ]
    }
   ],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "    if len(t_buffer[key]) > 10:\n",
    "        cnt_t+=1\n",
    "        for h in t_buffer[key]:\n",
    "            triple_set.add((h, 'has_stereoisomer', key))\n",
    "        cid_set.add(key)\n",
    "\n",
    "cnt_h = 0\n",
    "for key in h_buffer.keys():\n",
    "    if len(h_buffer[key]) > 10:\n",
    "        cnt_h+=1\n",
    "        for t in h_buffer[key]:\n",
    "            triple_set.add((key, 'has_stereoisomer', t))\n",
    "        cid_set.add(key)\n",
    "\n",
    "print(cnt_t, cnt_h, len(triple_set), len(cid_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_stereoisomer.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2same_connectivity ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound2sameconnectivity.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2821372/2821372 [00:02<00:00, 1357019.35it/s]\n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"has_same_connectivity\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "284 284 85538\n"
     ]
    }
   ],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "    if len(t_buffer[key]) > 10:\n",
    "        cnt_t+=1\n",
    "        for h in t_buffer[key]:\n",
    "            triple_set.add((h, 'has_same_connectivity', key))\n",
    "        cid_set.add(key)\n",
    "\n",
    "cnt_h = 0\n",
    "for key in h_buffer.keys():\n",
    "    if len(h_buffer[key]) > 10:\n",
    "        cnt_h+=1\n",
    "        for t in h_buffer[key]:\n",
    "            triple_set.add((key, 'has_same_connectivity', t))\n",
    "        cid_set.add(key)\n",
    "\n",
    "print(cnt_t, cnt_h, len(triple_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_connectivity.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2isotopologue ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound2isotopologue.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 368953/368953 [00:01<00:00, 202670.62it/s]\n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"has_isotopologue\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "51 51 4436 63738\n"
     ]
    }
   ],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "    if len(t_buffer[key]) > 3:\n",
    "        cnt_t+=1\n",
    "        for h in t_buffer[key]:\n",
    "            triple_set.add((h, 'has_isotopologue', key))\n",
    "        cid_set.add(key)\n",
    "\n",
    "cnt_h = 0\n",
    "for key in h_buffer.keys():\n",
    "    if len(h_buffer[key]) > 3:\n",
    "        cnt_h+=1\n",
    "        for t in h_buffer[key]:\n",
    "            triple_set.add((key, 'has_isotopologue', t))\n",
    "        cid_set.add(key)\n",
    "\n",
    "print(cnt_t, cnt_h, len(triple_set), len(cid_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_isotopologue.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2drugproduct ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound2drugproduct.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 17179/17179 [00:00<00:00, 830966.64it/s]\n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"to_drug\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "799 51 3708 63738\n"
     ]
    }
   ],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "    if len(t_buffer[key]) > 2:\n",
    "        cnt_t+=1\n",
    "        for h in t_buffer[key]:\n",
    "            triple_set.add((h, 'to_drug', key))\n",
    "\n",
    "\n",
    "print(cnt_t, cnt_h, len(triple_set), len(cid_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    t = t.replace(\"<http://purl.bioontology.org/ontology/SNOMEDCT/\", \"SNOMED_\").replace(\">\", \"\").replace(\"ns2:\", \"ns2_\")\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_todrug.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2closematch ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound_closematch.ttl000001.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 951092/951092 [00:00<00:00, 1173898.53it/s]\n"
     ]
    }
   ],
   "source": [
    "_, _, t_buffer = retrieve(triple_strs, relation=\"closematch\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "39241 51 42949 63738\n"
     ]
    }
   ],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "        cnt_t+=1\n",
    "        for h in t_buffer[key]:\n",
    "            triple_set.add((h, 'closematch', key))\n",
    "\n",
    "\n",
    "print(cnt_t, cnt_h, len(triple_set), len(cid_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    t = t.replace(\":\", \"_\")\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_closematch.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2type ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_compound_type.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 151979/151979 [00:00<00:00, 1095910.37it/s]\n"
     ]
    }
   ],
   "source": [
    "_, _, t_buffer = retrieve(triple_strs, relation=\"type\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1189 51 48360 63738\n"
     ]
    }
   ],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "    if len(t_buffer[key]) > 2:\n",
    "        cnt_t+=1\n",
    "        for h in t_buffer[key]:\n",
    "            triple_set.add((h, 'type', key))\n",
    "\n",
    "\n",
    "print(cnt_t, cnt_h, len(triple_set), len(cid_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    t = t.replace(\"<http://purl.bioontology.org/ontology/SNOMEDCT/\", \"SNOMED_\").replace(\">\", \"\").replace(\":\", \"_\")\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_totype.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2descriptor ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 160/160 [02:52<00:00,  1.08s/it]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "triple_strs = []\n",
    "\n",
    "for i in tqdm(range(1, 161)):\n",
    "    if len(str(i)) == 1:\n",
    "        idx = \"00\" + str(i)\n",
    "    if len(str(i)) == 2:\n",
    "        idx = \"0\" + str(i)\n",
    "    if len(str(i)) == 3:\n",
    "        idx = str(i)\n",
    "    \n",
    "    file_name = dir_ + f'pc_compound2descriptor_000{idx}.ttl'\n",
    "\n",
    "    with open(file_name, 'r') as f:\n",
    "        lines = f.read()\n",
    "\n",
    "    triple_strs += lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 114013741/114013741 [15:17<00:00, 124266.80it/s]\n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"has_descriptor\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 51 1456589 63738\n"
     ]
    }
   ],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "for key in t_buffer.keys():\n",
    "    for h in t_buffer[key]:\n",
    "        triple_set.add((h, 'has_descriptor', key))\n",
    "\n",
    "\n",
    "print(cnt_t, cnt_h, len(triple_set), len(cid_set))\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_descriptor.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "***=============================== compound2pathway ===============================***"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(dir_ + 'pc_pathway.ttl', 'r') as f:\n",
    "    lines = f.read()\n",
    "\n",
    "triple_strs = lines.split(' .\\n')[2:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 494606/494606 [00:01<00:00, 468974.82it/s]\n"
     ]
    }
   ],
   "source": [
    "triple_set, h_buffer, t_buffer = retrieve(triple_strs, relation=\"in_pathway\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "cnt_t = 0\n",
    "\n",
    "triple_set = set()\n",
    "for key in h_buffer.keys():\n",
    "    tmp_set = set()\n",
    "    for cid in h_buffer[key]:\n",
    "        if cid in cid_set:\n",
    "            tmp_set.add((cid, 'in_pathway', key.replace('pathway:', '')))\n",
    "        if len(tmp_set) > 8:\n",
    "            triple_set.update(tmp_set)\n",
    "\n",
    "out_str = \"\"\n",
    "\n",
    "for triple in triple_set:\n",
    "    h, r, t = triple\n",
    "    out_str += h +'\\t' + r  + '\\t' + t +'\\n'\n",
    "\n",
    "with open('./compound_pathway.txt', 'w') as f:\n",
    "    f.write(out_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.16 ('txgnn_env')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "79cb95e61c4f960f4e102f21c45668d32cb5c494b237694c15d64b50342e6e99"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
