{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_124327/2610146637.py:3: DtypeWarning: Columns (1,4) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df_prime = pd.read_csv('/data/pj20/txgnn/kg/kg_directed.csv')\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd \n",
    "\n",
    "df_prime = pd.read_csv('/data/pj20/txgnn/kg/kg_directed.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "      <th>x_idx</th>\n",
       "      <th>y_idx</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>gene/protein</td>\n",
       "      <td>9796.0</td>\n",
       "      <td>protein_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>56992.0</td>\n",
       "      <td>27422.0</td>\n",
       "      <td>19536.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>gene/protein</td>\n",
       "      <td>7918.0</td>\n",
       "      <td>protein_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>9240.0</td>\n",
       "      <td>23886.0</td>\n",
       "      <td>26764.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>gene/protein</td>\n",
       "      <td>8233.0</td>\n",
       "      <td>protein_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>23548.0</td>\n",
       "      <td>24822.0</td>\n",
       "      <td>10205.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>gene/protein</td>\n",
       "      <td>4899.0</td>\n",
       "      <td>protein_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>11253.0</td>\n",
       "      <td>16773.0</td>\n",
       "      <td>5880.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>gene/protein</td>\n",
       "      <td>5297.0</td>\n",
       "      <td>protein_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>8601.0</td>\n",
       "      <td>17769.0</td>\n",
       "      <td>25909.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4050244</th>\n",
       "      <td>disease</td>\n",
       "      <td>16982.0</td>\n",
       "      <td>disease_phenotype_positive</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>30448.0</td>\n",
       "      <td>4676.0</td>\n",
       "      <td>6378.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4050245</th>\n",
       "      <td>disease</td>\n",
       "      <td>19314_19023_2726</td>\n",
       "      <td>disease_phenotype_positive</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>8069.0</td>\n",
       "      <td>6562.0</td>\n",
       "      <td>13832.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4050246</th>\n",
       "      <td>disease</td>\n",
       "      <td>19314_19023_2726</td>\n",
       "      <td>disease_phenotype_positive</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>100495.0</td>\n",
       "      <td>6562.0</td>\n",
       "      <td>523.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4050247</th>\n",
       "      <td>disease</td>\n",
       "      <td>4747.0</td>\n",
       "      <td>disease_phenotype_positive</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>202.0</td>\n",
       "      <td>12414.0</td>\n",
       "      <td>4558.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4050248</th>\n",
       "      <td>disease</td>\n",
       "      <td>5297.0</td>\n",
       "      <td>disease_phenotype_positive</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>795.0</td>\n",
       "      <td>12891.0</td>\n",
       "      <td>13759.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4050249 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               x_type              x_id                    relation  \\\n",
       "0        gene/protein            9796.0             protein_protein   \n",
       "1        gene/protein            7918.0             protein_protein   \n",
       "2        gene/protein            8233.0             protein_protein   \n",
       "3        gene/protein            4899.0             protein_protein   \n",
       "4        gene/protein            5297.0             protein_protein   \n",
       "...               ...               ...                         ...   \n",
       "4050244       disease           16982.0  disease_phenotype_positive   \n",
       "4050245       disease  19314_19023_2726  disease_phenotype_positive   \n",
       "4050246       disease  19314_19023_2726  disease_phenotype_positive   \n",
       "4050247       disease            4747.0  disease_phenotype_positive   \n",
       "4050248       disease            5297.0  disease_phenotype_positive   \n",
       "\n",
       "                   y_type      y_id    x_idx    y_idx  \n",
       "0            gene/protein   56992.0  27422.0  19536.0  \n",
       "1            gene/protein    9240.0  23886.0  26764.0  \n",
       "2            gene/protein   23548.0  24822.0  10205.0  \n",
       "3            gene/protein   11253.0  16773.0   5880.0  \n",
       "4            gene/protein    8601.0  17769.0  25909.0  \n",
       "...                   ...       ...      ...      ...  \n",
       "4050244  effect/phenotype   30448.0   4676.0   6378.0  \n",
       "4050245  effect/phenotype    8069.0   6562.0  13832.0  \n",
       "4050246  effect/phenotype  100495.0   6562.0    523.0  \n",
       "4050247  effect/phenotype     202.0  12414.0   4558.0  \n",
       "4050248  effect/phenotype     795.0  12891.0  13759.0  \n",
       "\n",
       "[4050249 rows x 7 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_prime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_prime = df_prime[['x_type', 'x_id', 'relation', 'y_type', 'y_id']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_drug_src = df_prime[df_prime.x_type == 'drug']\n",
    "df_drug_dst = df_prime[df_prime.y_type == 'drug']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['disease', 'drug', 'effect/phenotype', 'gene/protein'],\n",
       "       dtype=object),\n",
       " array(['contraindication', 'drug_drug', 'drug_effect', 'drug_protein',\n",
       "        'indication', 'off-label use'], dtype=object))"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "np.unique(df_drug_src.y_type.values), np.unique(df_drug_src.relation.values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('./prime_kg_db_cid.json', 'r') as f:\n",
    "    db2cid = json.load(f)\n",
    "\n",
    "with open('./primekg_id_mapping.json' , 'r') as f:\n",
    "    protein_id2name = json.load(f)['id2name_protein']\n",
    "\n",
    "with open('./primekg_disease_id_to_dzid.json', 'r') as f:\n",
    "    disease_id2dzid = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>321075</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB09130</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>2157.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321076</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB09130</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>2153.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321077</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB09140</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>3040.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321078</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB00180</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>866.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321079</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB00240</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>866.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076036</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB00105</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2595.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076037</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB00105</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1618.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076038</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB00105</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1677.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076039</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB00105</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1262.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076040</th>\n",
       "      <td>drug</td>\n",
       "      <td>DB00105</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2791.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1469382 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        x_type     x_id      relation            y_type    y_id\n",
       "321075    drug  DB09130  drug_protein      gene/protein  2157.0\n",
       "321076    drug  DB09130  drug_protein      gene/protein  2153.0\n",
       "321077    drug  DB09140  drug_protein      gene/protein  3040.0\n",
       "321078    drug  DB00180  drug_protein      gene/protein   866.0\n",
       "321079    drug  DB00240  drug_protein      gene/protein   866.0\n",
       "...        ...      ...           ...               ...     ...\n",
       "2076036   drug  DB00105   drug_effect  effect/phenotype  2595.0\n",
       "2076037   drug  DB00105   drug_effect  effect/phenotype  1618.0\n",
       "2076038   drug  DB00105   drug_effect  effect/phenotype  1677.0\n",
       "2076039   drug  DB00105   drug_effect  effect/phenotype  1262.0\n",
       "2076040   drug  DB00105   drug_effect  effect/phenotype  2791.0\n",
       "\n",
       "[1469382 rows x 5 columns]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_drug_src"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_124327/2912432475.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df_drug_src['x_id'] = df_drug_src['x_id'].map(db2cid)\n"
     ]
    }
   ],
   "source": [
    "df_drug_src['x_id'] = df_drug_src['x_id'].map(db2cid)\n",
    "df_drug_src.loc[df_drug_src['y_type'] == 'gene/protein', 'y_id'] = df_drug_src.loc[df_drug_src['y_type'] == 'gene/protein', 'y_id'].map(protein_id2name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_drug_src.loc[df_drug_src['y_type'] == 'drug', 'y_id'] = df_drug_src.loc[df_drug_src['y_type'] == 'drug', 'y_id'].map(db2cid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_drug_src.loc[df_drug_src['y_type'] == 'disease', 'y_id'] = df_drug_src.loc[df_drug_src['y_type'] == 'disease', 'y_id'].map(disease_id2dzid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>321075</th>\n",
       "      <td>drug</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321076</th>\n",
       "      <td>drug</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321077</th>\n",
       "      <td>drug</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBA2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321078</th>\n",
       "      <td>drug</td>\n",
       "      <td>82153.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>SERPINA6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321079</th>\n",
       "      <td>drug</td>\n",
       "      <td>5311000.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>SERPINA6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076036</th>\n",
       "      <td>drug</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2595.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076037</th>\n",
       "      <td>drug</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1618.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076038</th>\n",
       "      <td>drug</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1677.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076039</th>\n",
       "      <td>drug</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1262.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076040</th>\n",
       "      <td>drug</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2791.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1469382 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        x_type        x_id      relation            y_type      y_id\n",
       "321075    drug     23978.0  drug_protein      gene/protein        F8\n",
       "321076    drug     23978.0  drug_protein      gene/protein        F5\n",
       "321077    drug       977.0  drug_protein      gene/protein      HBA2\n",
       "321078    drug     82153.0  drug_protein      gene/protein  SERPINA6\n",
       "321079    drug   5311000.0  drug_protein      gene/protein  SERPINA6\n",
       "...        ...         ...           ...               ...       ...\n",
       "2076036   drug  71306834.0   drug_effect  effect/phenotype    2595.0\n",
       "2076037   drug  71306834.0   drug_effect  effect/phenotype    1618.0\n",
       "2076038   drug  71306834.0   drug_effect  effect/phenotype    1677.0\n",
       "2076039   drug  71306834.0   drug_effect  effect/phenotype    1262.0\n",
       "2076040   drug  71306834.0   drug_effect  effect/phenotype    2791.0\n",
       "\n",
       "[1469382 rows x 5 columns]"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_drug_src"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_124327/2556175146.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df[['x_type', 'y_type']] = df[['x_type', 'y_type']].replace('drug', 'molecule')\n"
     ]
    }
   ],
   "source": [
    "df = df_drug_src\n",
    "df[['x_type', 'y_type']] = df[['x_type', 'y_type']].replace('drug', 'molecule')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>321075</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321076</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321077</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBA2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321078</th>\n",
       "      <td>molecule</td>\n",
       "      <td>82153.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>SERPINA6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321079</th>\n",
       "      <td>molecule</td>\n",
       "      <td>5311000.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>SERPINA6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076036</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2595.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076037</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1618.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076038</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1677.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076039</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1262.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076040</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2791.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1469382 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           x_type        x_id      relation            y_type      y_id\n",
       "321075   molecule     23978.0  drug_protein      gene/protein        F8\n",
       "321076   molecule     23978.0  drug_protein      gene/protein        F5\n",
       "321077   molecule       977.0  drug_protein      gene/protein      HBA2\n",
       "321078   molecule     82153.0  drug_protein      gene/protein  SERPINA6\n",
       "321079   molecule   5311000.0  drug_protein      gene/protein  SERPINA6\n",
       "...           ...         ...           ...               ...       ...\n",
       "2076036  molecule  71306834.0   drug_effect  effect/phenotype    2595.0\n",
       "2076037  molecule  71306834.0   drug_effect  effect/phenotype    1618.0\n",
       "2076038  molecule  71306834.0   drug_effect  effect/phenotype    1677.0\n",
       "2076039  molecule  71306834.0   drug_effect  effect/phenotype    1262.0\n",
       "2076040  molecule  71306834.0   drug_effect  effect/phenotype    2791.0\n",
       "\n",
       "[1469382 rows x 5 columns]"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_124327/2492878928.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  df['score'] = 1.0\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>321075</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F8</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321076</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F5</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321077</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBA2</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321078</th>\n",
       "      <td>molecule</td>\n",
       "      <td>82153.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>SERPINA6</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>321079</th>\n",
       "      <td>molecule</td>\n",
       "      <td>5311000.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>SERPINA6</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076036</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2595.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076037</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1618.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076038</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1677.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076039</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>1262.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2076040</th>\n",
       "      <td>molecule</td>\n",
       "      <td>71306834.0</td>\n",
       "      <td>drug_effect</td>\n",
       "      <td>effect/phenotype</td>\n",
       "      <td>2791.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1469382 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           x_type        x_id      relation            y_type      y_id  score\n",
       "321075   molecule     23978.0  drug_protein      gene/protein        F8    1.0\n",
       "321076   molecule     23978.0  drug_protein      gene/protein        F5    1.0\n",
       "321077   molecule       977.0  drug_protein      gene/protein      HBA2    1.0\n",
       "321078   molecule     82153.0  drug_protein      gene/protein  SERPINA6    1.0\n",
       "321079   molecule   5311000.0  drug_protein      gene/protein  SERPINA6    1.0\n",
       "...           ...         ...           ...               ...       ...    ...\n",
       "2076036  molecule  71306834.0   drug_effect  effect/phenotype    2595.0    1.0\n",
       "2076037  molecule  71306834.0   drug_effect  effect/phenotype    1618.0    1.0\n",
       "2076038  molecule  71306834.0   drug_effect  effect/phenotype    1677.0    1.0\n",
       "2076039  molecule  71306834.0   drug_effect  effect/phenotype    1262.0    1.0\n",
       "2076040  molecule  71306834.0   drug_effect  effect/phenotype    2791.0    1.0\n",
       "\n",
       "[1469382 rows x 6 columns]"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['score'] = 1.0\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Descriptor**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "998810"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Collect the raw descriptor triples from the ten numbered shard files, in shard order\n",
    "# (1..10), into a single list of lines. Lines keep their trailing newline.\n",
    "desc_lines = []\n",
    "for shard in range(1, 11):\n",
    "    with open(f'./compound_descriptor_triple_{shard}.txt', 'r') as f:\n",
    "        desc_lines += f.readlines()\n",
    "\n",
    "len(desc_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 998810/998810 [00:01<00:00, 855003.32it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "\n",
    "# Convert each tab-separated descriptor line (head, relation, tail) into an edge row;\n",
    "# head and tail are parsed as floats and the relation name is lower-cased.\n",
    "new_rows = []\n",
    "for line in tqdm(desc_lines):\n",
    "    # Strip only the line terminator. Slicing with [:-1] would silently drop the last\n",
    "    # character of the tail when the final line of a file has no trailing newline.\n",
    "    line = line.rstrip('\\n')\n",
    "    if not line:\n",
    "        continue  # blank line: nothing to unpack\n",
    "    h, r, t = line.split('\\t')\n",
    "    new_row = {'x_type': 'molecule', 'x_id': float(h), 'relation': r.lower(), 'y_type': 'value', 'y_id': float(t), 'score': 1.0}\n",
    "    new_rows.append(new_row)\n",
    "\n",
    "# Append the new edges to the running edge table and drop exact duplicate rows.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBA2</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>558</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBB</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9035</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>COX1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21318</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>NOX1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1514860</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>tpsa</td>\n",
       "      <td>value</td>\n",
       "      <td>34.1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1556489</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>rotatable_bond_count</td>\n",
       "      <td>value</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2409111</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>tautomer_count</td>\n",
       "      <td>value</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2420369</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>covalent_unit_count</td>\n",
       "      <td>value</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           x_type   x_id              relation        y_type  y_id  score\n",
       "2        molecule  977.0          drug_protein  gene/protein  HBA2    1.0\n",
       "558      molecule  977.0          drug_protein  gene/protein   HBB    1.0\n",
       "9035     molecule  977.0          drug_protein  gene/protein  COX1    1.0\n",
       "21318    molecule  977.0          drug_protein  gene/protein  NOX1    1.0\n",
       "1514860  molecule  977.0                  tpsa         value  34.1    1.0\n",
       "1556489  molecule  977.0  rotatable_bond_count         value   0.0    1.0\n",
       "2409111  molecule  977.0        tautomer_count         value   1.0    1.0\n",
       "2420369  molecule  977.0   covalent_unit_count         value   1.0    1.0"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check: display every edge row whose x_id equals 977.0.\n",
    "df.loc[df['x_id'] == 977.0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Co-occurrence**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Each co-occurrence table is stored as four numbered JSON parts. The parts are merged\n",
    "# in order 1..4 with dict.update, so on a duplicate key the later part wins.\n",
    "mol_mol = {}\n",
    "mol_dz = {}\n",
    "mol_gene = {}\n",
    "\n",
    "for table, stem in ((mol_mol, 'chemical_chemical'), (mol_dz, 'chemical_disease'), (mol_gene, 'chemical_gene')):\n",
    "    for part in range(1, 5):\n",
    "        with open(f'./{stem}_{part}.json', 'r') as f:\n",
    "            table.update(json.load(f))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 36010/36010 [00:00<00:00, 225941.11it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one 'cooccurence' edge per (source, partner) entry of mol_mol. Each entry is\n",
    "# indexed as [partner id, score]; any 'CID' text is removed from the partner id before\n",
    "# it is parsed as a float.\n",
    "new_rows = []\n",
    "\n",
    "for key, partners in tqdm(mol_mol.items()):\n",
    "    src = float(key)\n",
    "    for co in partners:\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule',\n",
    "            'x_id': src,\n",
    "            'relation': 'cooccurence',\n",
    "            'y_type': 'molecule',\n",
    "            'y_id': float(co[0].replace('CID', '')),\n",
    "            'score': float(co[1]),\n",
    "        })\n",
    "\n",
    "# Append to the running edge table, then drop exact duplicate rows.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 30269/30269 [00:00<00:00, 315113.62it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one 'cooccurence' edge per (source, disease) entry of mol_dz. Each entry is\n",
    "# indexed as [disease id, score]; the disease id is stored unchanged.\n",
    "new_rows = []\n",
    "\n",
    "for key, partners in tqdm(mol_dz.items()):\n",
    "    src = float(key)\n",
    "    for co in partners:\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule',\n",
    "            'x_id': src,\n",
    "            'relation': 'cooccurence',\n",
    "            'y_type': 'disease',\n",
    "            'y_id': co[0],\n",
    "            'score': float(co[1]),\n",
    "        })\n",
    "\n",
    "# Append to the running edge table, then drop exact duplicate rows.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 29835/29835 [00:00<00:00, 328373.05it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one 'cooccurence' edge per (source, gene/protein) entry of mol_gene. Each entry\n",
    "# is indexed as [gene/protein id, score]; the id is stored unchanged.\n",
    "new_rows = []\n",
    "\n",
    "for key, partners in tqdm(mol_gene.items()):\n",
    "    src = float(key)\n",
    "    for co in partners:\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule',\n",
    "            'x_id': src,\n",
    "            'relation': 'cooccurence',\n",
    "            'y_type': 'gene/protein',\n",
    "            'y_id': co[0],\n",
    "            'score': float(co[1]),\n",
    "        })\n",
    "\n",
    "# Append to the running edge table, then drop exact duplicate rows.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBA2</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>545</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBB</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8765</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>COX1</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20521</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>NOX1</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1318485</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>tpsa</td>\n",
       "      <td>value</td>\n",
       "      <td>34.1</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1360114</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>rotatable_bond_count</td>\n",
       "      <td>value</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1393877</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>tautomer_count</td>\n",
       "      <td>value</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1397415</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>covalent_unit_count</td>\n",
       "      <td>value</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1412619</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>molecule</td>\n",
       "      <td>962.0</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1412620</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>molecule</td>\n",
       "      <td>947.0</td>\n",
       "      <td>0.994681</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1412621</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>molecule</td>\n",
       "      <td>5462310.0</td>\n",
       "      <td>0.862397</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1412622</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>molecule</td>\n",
       "      <td>280.0</td>\n",
       "      <td>0.699679</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1412623</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>molecule</td>\n",
       "      <td>5793.0</td>\n",
       "      <td>0.676426</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1587203</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>disease</td>\n",
       "      <td>DZID6914</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1587204</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>disease</td>\n",
       "      <td>DZID8198</td>\n",
       "      <td>0.393408</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1587205</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>disease</td>\n",
       "      <td>DZID6942</td>\n",
       "      <td>0.308140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1587206</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>disease</td>\n",
       "      <td>DZID10277</td>\n",
       "      <td>0.223410</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1587207</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>disease</td>\n",
       "      <td>DZID8607</td>\n",
       "      <td>0.200368</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719805</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>EC_1.15.1.1</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719806</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>CAT</td>\n",
       "      <td>0.814292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719807</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HIF1A</td>\n",
       "      <td>0.456201</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719808</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>EC_1.10.3.9</td>\n",
       "      <td>0.454179</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719809</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>VEGFA</td>\n",
       "      <td>0.453207</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           x_type   x_id              relation        y_type         y_id  \\\n",
       "2        molecule  977.0          drug_protein  gene/protein         HBA2   \n",
       "545      molecule  977.0          drug_protein  gene/protein          HBB   \n",
       "8765     molecule  977.0          drug_protein  gene/protein         COX1   \n",
       "20521    molecule  977.0          drug_protein  gene/protein         NOX1   \n",
       "1318485  molecule  977.0                  tpsa         value         34.1   \n",
       "1360114  molecule  977.0  rotatable_bond_count         value          0.0   \n",
       "1393877  molecule  977.0        tautomer_count         value          1.0   \n",
       "1397415  molecule  977.0   covalent_unit_count         value          1.0   \n",
       "1412619  molecule  977.0           cooccurence      molecule        962.0   \n",
       "1412620  molecule  977.0           cooccurence      molecule        947.0   \n",
       "1412621  molecule  977.0           cooccurence      molecule    5462310.0   \n",
       "1412622  molecule  977.0           cooccurence      molecule        280.0   \n",
       "1412623  molecule  977.0           cooccurence      molecule       5793.0   \n",
       "1587203  molecule  977.0           cooccurence       disease     DZID6914   \n",
       "1587204  molecule  977.0           cooccurence       disease     DZID8198   \n",
       "1587205  molecule  977.0           cooccurence       disease     DZID6942   \n",
       "1587206  molecule  977.0           cooccurence       disease    DZID10277   \n",
       "1587207  molecule  977.0           cooccurence       disease     DZID8607   \n",
       "1719805  molecule  977.0           cooccurence  gene/protein  EC_1.15.1.1   \n",
       "1719806  molecule  977.0           cooccurence  gene/protein          CAT   \n",
       "1719807  molecule  977.0           cooccurence  gene/protein        HIF1A   \n",
       "1719808  molecule  977.0           cooccurence  gene/protein  EC_1.10.3.9   \n",
       "1719809  molecule  977.0           cooccurence  gene/protein        VEGFA   \n",
       "\n",
       "            score  \n",
       "2        1.000000  \n",
       "545      1.000000  \n",
       "8765     1.000000  \n",
       "20521    1.000000  \n",
       "1318485  1.000000  \n",
       "1360114  1.000000  \n",
       "1393877  1.000000  \n",
       "1397415  1.000000  \n",
       "1412619  1.000000  \n",
       "1412620  0.994681  \n",
       "1412621  0.862397  \n",
       "1412622  0.699679  \n",
       "1412623  0.676426  \n",
       "1587203  1.000000  \n",
       "1587204  0.393408  \n",
       "1587205  0.308140  \n",
       "1587206  0.223410  \n",
       "1587207  0.200368  \n",
       "1719805  1.000000  \n",
       "1719806  0.814292  \n",
       "1719807  0.456201  \n",
       "1719808  0.454179  \n",
       "1719809  0.453207  "
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check: display every edge row whose x_id equals 977.0.\n",
    "df.loc[df['x_id'] == 977.0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Neighbor**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the 2D and 3D neighbor tables from their JSON files.\n",
    "with open('./chemical_nbr2d.json', 'r') as nbr2d_file:\n",
    "    mol_nbr2d = json.load(nbr2d_file)\n",
    "\n",
    "with open('./chemical_nbr3d.json', 'r') as nbr3d_file:\n",
    "    mol_nbr3d = json.load(nbr3d_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 56679/56679 [00:00<00:00, 898236.08it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one 'neighbor_2d' edge per source molecule in mol_nbr2d. A value may be null\n",
    "# (no neighbor), a single id, or a list of ids; for a list only the first id is used.\n",
    "new_rows = []\n",
    "\n",
    "for key, value in tqdm(mol_nbr2d.items()):\n",
    "    if value is None:\n",
    "        continue\n",
    "    if isinstance(value, list):\n",
    "        if not value:\n",
    "            continue  # empty list: no neighbor to link (indexing [0] would raise)\n",
    "        nbr = float(value[0])\n",
    "    else:\n",
    "        nbr = float(value)\n",
    "    src = float(key)\n",
    "    new_row = {'x_type': 'molecule', 'x_id': src, 'relation': 'neighbor_2d', 'y_type': 'molecule', 'y_id': nbr, 'score': 1.0}\n",
    "    new_rows.append(new_row)\n",
    "\n",
    "# Append to the running edge table, then drop exact duplicate rows.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 51665/51665 [00:00<00:00, 910781.99it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build one 'neighbor_3d' edge per source molecule in mol_nbr3d. A value may be null\n",
    "# (no neighbor), a single id, or a list of ids; for a list only the first id is used.\n",
    "new_rows = []\n",
    "\n",
    "for key, value in tqdm(mol_nbr3d.items()):\n",
    "    if value is None:\n",
    "        continue\n",
    "    if isinstance(value, list):\n",
    "        if not value:\n",
    "            continue  # empty list: no neighbor to link (indexing [0] would raise)\n",
    "        nbr = float(value[0])\n",
    "    else:\n",
    "        nbr = float(value)\n",
    "    src = float(key)\n",
    "    new_row = {'x_type': 'molecule', 'x_id': src, 'relation': 'neighbor_3d', 'y_type': 'molecule', 'y_id': nbr, 'score': 1.0}\n",
    "    new_rows.append(new_row)\n",
    "\n",
    "# Append to the running edge table, then drop exact duplicate rows.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F8</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>F5</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>426</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>ALB</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>600</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>A2M</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>680</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>CP</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719802</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>ATP7B</td>\n",
       "      <td>0.460951</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719803</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>EC_1.9.3.1</td>\n",
       "      <td>0.352543</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1719804</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>cooccurence</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>CAT</td>\n",
       "      <td>0.332801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1849158</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>neighbor_2d</td>\n",
       "      <td>molecule</td>\n",
       "      <td>78.0</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1902652</th>\n",
       "      <td>molecule</td>\n",
       "      <td>23978.0</td>\n",
       "      <td>neighbor_3d</td>\n",
       "      <td>molecule</td>\n",
       "      <td>78.0</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>207 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           x_type     x_id      relation        y_type        y_id     score\n",
       "0        molecule  23978.0  drug_protein  gene/protein          F8  1.000000\n",
       "1        molecule  23978.0  drug_protein  gene/protein          F5  1.000000\n",
       "426      molecule  23978.0  drug_protein  gene/protein         ALB  1.000000\n",
       "600      molecule  23978.0  drug_protein  gene/protein         A2M  1.000000\n",
       "680      molecule  23978.0  drug_protein  gene/protein          CP  1.000000\n",
       "...           ...      ...           ...           ...         ...       ...\n",
       "1719802  molecule  23978.0   cooccurence  gene/protein       ATP7B  0.460951\n",
       "1719803  molecule  23978.0   cooccurence  gene/protein  EC_1.9.3.1  0.352543\n",
       "1719804  molecule  23978.0   cooccurence  gene/protein         CAT  0.332801\n",
       "1849158  molecule  23978.0   neighbor_2d      molecule        78.0  1.000000\n",
       "1902652  molecule  23978.0   neighbor_3d      molecule        78.0  1.000000\n",
       "\n",
       "[207 rows x 6 columns]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot check: display every edge row whose x_id equals 23978.0.\n",
    "df.loc[df['x_id'] == 23978.0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Same_connectivity**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read tab-separated (head, relation, tail) triples from compound_connectivity.txt and\n",
    "# append them as molecule -> molecule edges with score 1.0.\n",
    "with open('./compound_connectivity.txt', 'r') as f:\n",
    "    lines = f.readlines()\n",
    "\n",
    "new_rows = []\n",
    "for line in lines:\n",
    "    # Strip only the line terminator. Slicing with [:-1] would silently drop the last\n",
    "    # character of the tail when the final line has no trailing newline.\n",
    "    line = line.rstrip('\\n')\n",
    "    if not line:\n",
    "        continue  # blank line: nothing to unpack\n",
    "    h, r, t = line.split('\\t')\n",
    "    new_row = {'x_type': 'molecule', 'x_id': float(h), 'relation': r, 'y_type': 'molecule', 'y_id': float(t), 'score': 1.0}\n",
    "    new_rows.append(new_row)\n",
    "\n",
    "\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Component**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read tab-separated (head, relation, tail) triples from compound_component.txt and\n",
    "# append them as molecule -> molecule edges with score 1.0.\n",
    "with open('./compound_component.txt', 'r') as f:\n",
    "    lines = f.readlines()\n",
    "\n",
    "new_rows = []\n",
    "for line in lines:\n",
    "    # Strip only the line terminator. Slicing with [:-1] would silently drop the last\n",
    "    # character of the tail when the final line has no trailing newline.\n",
    "    line = line.rstrip('\\n')\n",
    "    if not line:\n",
    "        continue  # blank line: nothing to unpack\n",
    "    h, r, t = line.split('\\t')\n",
    "    new_row = {'x_type': 'molecule', 'x_id': float(h), 'relation': r, 'y_type': 'molecule', 'y_id': float(t), 'score': 1.0}\n",
    "    new_rows.append(new_row)\n",
    "\n",
    "\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Isotopologue**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read tab-separated (head, relation, tail) triples from compound_isotopologue.txt and\n",
    "# append them as molecule -> molecule edges with score 1.0.\n",
    "with open('./compound_isotopologue.txt', 'r') as f:\n",
    "    lines = f.readlines()\n",
    "\n",
    "new_rows = []\n",
    "for line in lines:\n",
    "    # Strip only the line terminator. Slicing with [:-1] would silently drop the last\n",
    "    # character of the tail when the final line has no trailing newline.\n",
    "    line = line.rstrip('\\n')\n",
    "    if not line:\n",
    "        continue  # blank line: nothing to unpack\n",
    "    h, r, t = line.split('\\t')\n",
    "    new_row = {'x_type': 'molecule', 'x_id': float(h), 'relation': r, 'y_type': 'molecule', 'y_id': float(t), 'score': 1.0}\n",
    "    new_rows.append(new_row)\n",
    "\n",
    "\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Parent**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the edges listed in compound_parent.txt to the knowledge-graph frame `df`.\n",
    "# Each data line carries three tab-separated fields: head id, relation name, tail id.\n",
    "new_rows = []\n",
    "with open('./compound_parent.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # blank line: nothing to unpack\n",
    "        # Remove only the line terminator; slicing with `line[:-1]` would also eat the\n",
    "        # last character of a final line that has no trailing newline.\n",
    "        h, r, t = line.rstrip('\\n').split('\\t')\n",
    "        # Head and tail are both stored as float ids typed 'molecule'; score is fixed at 1.0.\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule', 'x_id': float(h),\n",
    "            'relation': r,\n",
    "            'y_type': 'molecule', 'y_id': float(t),\n",
    "            'score': 1.0,\n",
    "        })\n",
    "\n",
    "# Add the new edges, then drop rows that are exact duplicates.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Stereoisomer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the edges listed in compound_stereoisomer.txt to the knowledge-graph frame `df`.\n",
    "# Each data line carries three tab-separated fields: head id, relation name, tail id.\n",
    "new_rows = []\n",
    "with open('./compound_stereoisomer.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # blank line: nothing to unpack\n",
    "        # Remove only the line terminator; slicing with `line[:-1]` would also eat the\n",
    "        # last character of a final line that has no trailing newline.\n",
    "        h, r, t = line.rstrip('\\n').split('\\t')\n",
    "        # Head and tail are both stored as float ids typed 'molecule'; score is fixed at 1.0.\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule', 'x_id': float(h),\n",
    "            'relation': r,\n",
    "            'y_type': 'molecule', 'y_id': float(t),\n",
    "            'score': 1.0,\n",
    "        })\n",
    "\n",
    "# Add the new edges, then drop rows that are exact duplicates.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Compound2drug**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the edges listed in compound_todrug.txt to the knowledge-graph frame `df`.\n",
    "# Each data line carries three tab-separated fields: head id, relation name, tail id.\n",
    "new_rows = []\n",
    "with open('./compound_todrug.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # blank line: nothing to unpack\n",
    "        # Remove only the line terminator; slicing with `line[:-1]` would silently\n",
    "        # truncate the tail id of a final line that has no trailing newline.\n",
    "        h, r, t = line.rstrip('\\n').split('\\t')\n",
    "        # The head is a float molecule id; the tail is kept as the raw string and typed 'drug'.\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule', 'x_id': float(h),\n",
    "            'relation': r,\n",
    "            'y_type': 'drug', 'y_id': t,\n",
    "            'score': 1.0,\n",
    "        })\n",
    "\n",
    "# Add the new edges, then drop rows that are exact duplicates.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Compound2type**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the edges listed in compound_totype.txt to the knowledge-graph frame `df`.\n",
    "# Each data line carries three tab-separated fields: head id, relation name, tail id.\n",
    "new_rows = []\n",
    "with open('./compound_totype.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # blank line: nothing to unpack\n",
    "        # Remove only the line terminator; slicing with `line[:-1]` would silently\n",
    "        # truncate the tail id of a final line that has no trailing newline.\n",
    "        h, r, t = line.rstrip('\\n').split('\\t')\n",
    "        # The head is a float molecule id; the tail is kept as the raw string.\n",
    "        # NOTE(review): y_type is 'drug' here, exactly as in the compound_todrug cell, although\n",
    "        # this file is the compound-to-type mapping -- confirm the intended node type.\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule', 'x_id': float(h),\n",
    "            'relation': r,\n",
    "            'y_type': 'drug', 'y_id': t,\n",
    "            'score': 1.0,\n",
    "        })\n",
    "\n",
    "# Add the new edges, then drop rows that are exact duplicates.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Pathway**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the edges listed in compound_pathway.txt to the knowledge-graph frame `df`.\n",
    "# Each data line carries three tab-separated fields: head id, relation name, tail id.\n",
    "new_rows = []\n",
    "with open('./compound_pathway.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        if not line.strip():\n",
    "            continue  # blank line: nothing to unpack\n",
    "        # Remove only the line terminator; slicing with `line[:-1]` would silently\n",
    "        # truncate the pathway id of a final line that has no trailing newline.\n",
    "        h, r, t = line.rstrip('\\n').split('\\t')\n",
    "        # The head is a float molecule id; the tail is kept as the raw string and typed 'pathway'.\n",
    "        new_rows.append({\n",
    "            'x_type': 'molecule', 'x_id': float(h),\n",
    "            'relation': r,\n",
    "            'y_type': 'pathway', 'y_id': t,\n",
    "            'score': 1.0,\n",
    "        })\n",
    "\n",
    "# Add the new edges, then drop rows that are exact duplicates.\n",
    "df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n",
    "df = df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_type</th>\n",
       "      <th>x_id</th>\n",
       "      <th>relation</th>\n",
       "      <th>y_type</th>\n",
       "      <th>y_id</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBA2</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>545</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>HBB</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8765</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>COX1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20521</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>drug_protein</td>\n",
       "      <td>gene/protein</td>\n",
       "      <td>NOX1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1318485</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>tpsa</td>\n",
       "      <td>value</td>\n",
       "      <td>34.1</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2525546</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>in_pathway</td>\n",
       "      <td>pathway</td>\n",
       "      <td>PWID24198</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2525812</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>in_pathway</td>\n",
       "      <td>pathway</td>\n",
       "      <td>PWID1184698</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2526069</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>in_pathway</td>\n",
       "      <td>pathway</td>\n",
       "      <td>PWID1184675</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2526370</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>in_pathway</td>\n",
       "      <td>pathway</td>\n",
       "      <td>PWID1282200</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2526659</th>\n",
       "      <td>molecule</td>\n",
       "      <td>977.0</td>\n",
       "      <td>in_pathway</td>\n",
       "      <td>pathway</td>\n",
       "      <td>PWID1184528</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1248 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           x_type   x_id      relation        y_type         y_id  score\n",
       "2        molecule  977.0  drug_protein  gene/protein         HBA2    1.0\n",
       "545      molecule  977.0  drug_protein  gene/protein          HBB    1.0\n",
       "8765     molecule  977.0  drug_protein  gene/protein         COX1    1.0\n",
       "20521    molecule  977.0  drug_protein  gene/protein         NOX1    1.0\n",
       "1318485  molecule  977.0          tpsa         value         34.1    1.0\n",
       "...           ...    ...           ...           ...          ...    ...\n",
       "2525546  molecule  977.0    in_pathway       pathway    PWID24198    1.0\n",
       "2525812  molecule  977.0    in_pathway       pathway  PWID1184698    1.0\n",
       "2526069  molecule  977.0    in_pathway       pathway  PWID1184675    1.0\n",
       "2526370  molecule  977.0    in_pathway       pathway  PWID1282200    1.0\n",
       "2526659  molecule  977.0    in_pathway       pathway  PWID1184528    1.0\n",
       "\n",
       "[1248 rows x 6 columns]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the merged graph: show every row whose x_id equals 977.0.\n",
    "df.loc[df['x_id'] == 977.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the assembled edge table to KG.csv; the row index is not written.\n",
    "df.to_csv(path_or_buf=\"KG.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Reload the exported graph. The y_id column mixes numeric and string ids, so the default\n",
    "# chunked type inference emits a DtypeWarning and can give different chunks different dtypes.\n",
    "# low_memory=False makes pandas infer each column's dtype from the whole file at once.\n",
    "df = pd.read_csv(\"KG.csv\", low_memory=False)\n",
    "\n",
    "# Select rows whose relation is the generic 'cooccurence' label (spelled this way in the data).\n",
    "mask = df['relation'] == 'cooccurence'\n",
    "\n",
    "# Make the label type-specific for these rows: 'cooccurence_<x_type>_<y_type>'.\n",
    "df.loc[mask, 'relation'] = 'cooccurence_' + df.loc[mask, 'x_type'] + '_' + df.loc[mask, 'y_type']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['drug_protein', 'contraindication', 'indication', 'off-label use',\n",
       "       'drug_drug', 'drug_effect', 'defined_bond_stereo_count', 'tpsa',\n",
       "       'rotatable_bond_count', 'xlogp3-aa', 'structure_complexity',\n",
       "       'covalent_unit_count', 'defined_atom_stereo_count',\n",
       "       'molecular_weight', 'hydrogen_bond_donor_count',\n",
       "       'undefined_bond_stereo_count', 'isotope_atom_count', 'exact_mass',\n",
       "       'mono_isotopic_weight', 'total_formal_charge',\n",
       "       'hydrogen_bond_acceptor_count', 'non-hydrogen_atom_count',\n",
       "       'tautomer_count', 'undefined_atom_stereo_count', 'xlogp3',\n",
       "       'cooccurence_molecule_molecule', 'cooccurence_molecule_disease',\n",
       "       'cooccurence_molecule_gene/protein', 'neighbor_2d', 'neighbor_3d',\n",
       "       'has_same_connectivity', 'has_component', 'has_isotopologue',\n",
       "       'has_parent', 'has_stereoisomer', 'to_drug', 'closematch', 'type',\n",
       "       'in_pathway'], dtype=object)"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the distinct relation labels after the cooccurence relabelling.\n",
    "df['relation'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overwrite KG.csv with the current frame; the row index is not written.\n",
    "df.to_csv(path_or_buf=\"KG.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.16 ('txgnn_env')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "79cb95e61c4f960f4e102f21c45668d32cb5c494b237694c15d64b50342e6e99"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
