{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "pd.set_option(\"display.max_columns\", None)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Workbook that is loaded into nanozymes_df; the path is relative to the working directory.\n",
    "NANOZYMES_XLSX = \"nanozymes.xlsx\"\n",
    "nanozymes_df = pd.read_excel(NANOZYMES_XLSX)\n",
    "\n",
    "\n",
    "def extract_id_from_filename(filename):\n",
    "    return filename[:-3].upper()\n",
    "\n",
    "\n",
    "def extract_id_from_link(link):\n",
    "    return link.split(\"/\")[-1].upper()\n",
    "\n",
    "\n",
    "# Spreadsheet header -> column name used in the rest of the notebook.\n",
    "column_rename_map = {\n",
    "    \"Syngony\": \"syngony\",\n",
    "    \"length, nm\": \"length\",\n",
    "    \"width, nm\": \"width\",\n",
    "    \"depth, nm\": \"depth\",\n",
    "    \"pol\": \"polymer\",\n",
    "    \"surf\": \"surfactants\",\n",
    "    \"Mw(coat), g/mol\": \"molar_mass\",\n",
    "    \"Km, mM\": \"km\",\n",
    "    \"Vmax, mM/s\": \"v_max\",\n",
    "    \"ReactionType\": \"reaction_type\",\n",
    "    \"C min, mM\": \"c_min\",\n",
    "    \"C max, mM\": \"c_max\",\n",
    "    \"C(const), mM\": \"c_const\",\n",
    "    \"Ccat(mg/mL)\": \"ccat\",\n",
    "    \"ph\": \"ph\",\n",
    "    \"temp, °C\": \"temperature\",\n",
    "}\n",
    "\n",
    "# Add the article ID derived from each row's link, then apply the header\n",
    "# mapping. Rebinding the chained result replaces ``inplace=True`` mutation.\n",
    "nanozymes_df = nanozymes_df.assign(\n",
    "    article_id=nanozymes_df[\"link\"].map(extract_id_from_link)\n",
    ").rename(columns=column_rename_map)\n",
    "\n",
    "# Columns that must hold numbers after loading.\n",
    "num_cols = [\n",
    "    \"syngony\",\n",
    "    \"length\",\n",
    "    \"width\",\n",
    "    \"depth\",\n",
    "    \"molar_mass\",\n",
    "    \"km\",\n",
    "    \"v_max\",\n",
    "    \"c_min\",\n",
    "    \"c_max\",\n",
    "    \"c_const\",\n",
    "    \"ccat\",\n",
    "    \"ph\",\n",
    "    \"temperature\",\n",
    "]\n",
    "# Coerce every listed column to a numeric dtype; entries that cannot be\n",
    "# parsed become NaN because of errors=\"coerce\".\n",
    "nanozymes_df = nanozymes_df.assign(\n",
    "    **{\n",
    "        name: pd.to_numeric(nanozymes_df[name], errors=\"coerce\")\n",
    "        for name in num_cols\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>formula</th>\n",
       "      <th>activity</th>\n",
       "      <th>syngony</th>\n",
       "      <th>length</th>\n",
       "      <th>width</th>\n",
       "      <th>depth</th>\n",
       "      <th>surface</th>\n",
       "      <th>polymer</th>\n",
       "      <th>surfactants</th>\n",
       "      <th>molar_mass</th>\n",
       "      <th>km</th>\n",
       "      <th>v_max</th>\n",
       "      <th>reaction_type</th>\n",
       "      <th>c_min</th>\n",
       "      <th>c_max</th>\n",
       "      <th>c_const</th>\n",
       "      <th>ccat</th>\n",
       "      <th>ph</th>\n",
       "      <th>temperature</th>\n",
       "      <th>link</th>\n",
       "      <th>article_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.00645</td>\n",
       "      <td>1.376300</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>1.500</td>\n",
       "      <td>1.50</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.05537</td>\n",
       "      <td>0.264300</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>1.500</td>\n",
       "      <td>1.50</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.03551</td>\n",
       "      <td>8.363000</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.500</td>\n",
       "      <td>25.00</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.22769</td>\n",
       "      <td>0.438200</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.500</td>\n",
       "      <td>25.00</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>24.5</td>\n",
       "      <td>24.5</td>\n",
       "      <td>24.5</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.01725</td>\n",
       "      <td>1.027200</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>0.200</td>\n",
       "      <td>100.00</td>\n",
       "      <td>15.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1173</th>\n",
       "      <td>Ti3C2</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>6.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ssDNA</td>\n",
       "      <td>DNA</td>\n",
       "      <td>Tetramethylammonium hydroxide</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.85600</td>\n",
       "      <td>0.000080</td>\n",
       "      <td>OPD + H2O2</td>\n",
       "      <td>0.001</td>\n",
       "      <td>2.00</td>\n",
       "      <td>10.00</td>\n",
       "      <td>0.005000</td>\n",
       "      <td>4.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>https://doi.org/10.1016/j.microc.2021.106238</td>\n",
       "      <td>J.MICROC.2021.106238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1174</th>\n",
       "      <td>Au</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>ss-DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>17.67000</td>\n",
       "      <td>0.000180</td>\n",
       "      <td>glucose</td>\n",
       "      <td>5.000</td>\n",
       "      <td>25.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>https://doi.org/10.1002/anie.201105121</td>\n",
       "      <td>ANIE.201105121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1175</th>\n",
       "      <td>Au</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>ds-DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>6.98000</td>\n",
       "      <td>0.000530</td>\n",
       "      <td>glucose</td>\n",
       "      <td>5.000</td>\n",
       "      <td>25.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>https://doi.org/10.1002/anie.201105121</td>\n",
       "      <td>ANIE.201105121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1176</th>\n",
       "      <td>Au</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.97000</td>\n",
       "      <td>0.000630</td>\n",
       "      <td>glucose</td>\n",
       "      <td>0.010</td>\n",
       "      <td>0.25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.2</td>\n",
       "      <td>25.0</td>\n",
       "      <td>https://doi.org/10.1021/nn102592h</td>\n",
       "      <td>NN102592H</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1177</th>\n",
       "      <td>Pt</td>\n",
       "      <td>laccase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>4.6</td>\n",
       "      <td>4.6</td>\n",
       "      <td>4.6</td>\n",
       "      <td>DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>p-Dimethylaminobenzaldehyde</td>\n",
       "      <td>149.19</td>\n",
       "      <td>0.12000</td>\n",
       "      <td>0.000142</td>\n",
       "      <td>Dichlorophenol  + aminoantipyrine</td>\n",
       "      <td>0.001</td>\n",
       "      <td>2.00</td>\n",
       "      <td>0.32</td>\n",
       "      <td>0.001400</td>\n",
       "      <td>7.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>https://doi.org/10.1007/s10562-017-2106-5</td>\n",
       "      <td>S10562-017-2106-5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1178 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      formula    activity  syngony  length  width  depth surface     polymer  \\\n",
       "0     CoFe2O4  peroxidase      7.0     4.1    4.1    4.1   naked  oleic acid   \n",
       "1     CoFe2O4  peroxidase      7.0    13.8   13.8   13.8   naked  oleic acid   \n",
       "2     CoFe2O4  peroxidase      7.0     4.1    4.1    4.1   naked  oleic acid   \n",
       "3     CoFe2O4  peroxidase      7.0    13.8   13.8   13.8   naked  oleic acid   \n",
       "4     CoFe2O4  peroxidase      7.0    24.5   24.5   24.5   naked  oleic acid   \n",
       "...       ...         ...      ...     ...    ...    ...     ...         ...   \n",
       "1173    Ti3C2  peroxidase      6.0     NaN    NaN    NaN   ssDNA         DNA   \n",
       "1174       Au     oxidase      7.0    13.0   13.0   13.0  ss-DNA           0   \n",
       "1175       Au     oxidase      7.0    13.0   13.0   13.0  ds-DNA           0   \n",
       "1176       Au     oxidase      7.0    13.0   13.0   13.0     DNA           0   \n",
       "1177       Pt     laccase      7.0     4.6    4.6    4.6     DNA           0   \n",
       "\n",
       "                        surfactants  molar_mass        km     v_max  \\\n",
       "0                                 0      282.47   0.00645  1.376300   \n",
       "1                                 0      282.47   0.05537  0.264300   \n",
       "2                                 0      282.47   0.03551  8.363000   \n",
       "3                                 0      282.47   0.22769  0.438200   \n",
       "4                                 0      282.47   0.01725  1.027200   \n",
       "...                             ...         ...       ...       ...   \n",
       "1173  Tetramethylammonium hydroxide         NaN   0.85600  0.000080   \n",
       "1174                              0        0.00  17.67000  0.000180   \n",
       "1175                              0        0.00   6.98000  0.000530   \n",
       "1176                              0         NaN   6.97000  0.000630   \n",
       "1177    p-Dimethylaminobenzaldehyde      149.19   0.12000  0.000142   \n",
       "\n",
       "                          reaction_type  c_min   c_max  c_const      ccat  \\\n",
       "0                            TMB + H2O2  1.500    1.50   100.00  0.000026   \n",
       "1                            TMB + H2O2  1.500    1.50   100.00  0.000026   \n",
       "2                            H2O2 + TMB  0.500   25.00   100.00  0.000026   \n",
       "3                            H2O2 + TMB  0.500   25.00   100.00  0.000026   \n",
       "4                            TMB + H2O2  0.200  100.00    15.00  0.000026   \n",
       "...                                 ...    ...     ...      ...       ...   \n",
       "1173                         OPD + H2O2  0.001    2.00    10.00  0.005000   \n",
       "1174                            glucose  5.000   25.00      NaN       NaN   \n",
       "1175                            glucose  5.000   25.00      NaN       NaN   \n",
       "1176                            glucose  0.010    0.25      NaN       NaN   \n",
       "1177  Dichlorophenol  + aminoantipyrine  0.001    2.00     0.32  0.001400   \n",
       "\n",
       "       ph  temperature                                           link  \\\n",
       "0     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "1     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "2     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "3     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "4     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "...   ...          ...                                            ...   \n",
       "1173  4.0         40.0   https://doi.org/10.1016/j.microc.2021.106238   \n",
       "1174  NaN         34.0        https://doi.org/10.1002/anie.201105121    \n",
       "1175  NaN         34.0        https://doi.org/10.1002/anie.201105121    \n",
       "1176  7.2         25.0              https://doi.org/10.1021/nn102592h   \n",
       "1177  7.0         25.0  https://doi.org/10.1007/s10562-017-2106-5       \n",
       "\n",
       "                 article_id  \n",
       "0                C4RA15675G  \n",
       "1                C4RA15675G  \n",
       "2                C4RA15675G  \n",
       "3                C4RA15675G  \n",
       "4                C4RA15675G  \n",
       "...                     ...  \n",
       "1173   J.MICROC.2021.106238  \n",
       "1174        ANIE.201105121   \n",
       "1175        ANIE.201105121   \n",
       "1176              NN102592H  \n",
       "1177  S10562-017-2106-5      \n",
       "\n",
       "[1178 rows x 21 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nanozymes_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>formula</th>\n",
       "      <th>activity</th>\n",
       "      <th>syngony</th>\n",
       "      <th>length</th>\n",
       "      <th>width</th>\n",
       "      <th>depth</th>\n",
       "      <th>surface</th>\n",
       "      <th>polymer</th>\n",
       "      <th>surfactants</th>\n",
       "      <th>molar_mass</th>\n",
       "      <th>km</th>\n",
       "      <th>v_max</th>\n",
       "      <th>reaction_type</th>\n",
       "      <th>c_min</th>\n",
       "      <th>c_max</th>\n",
       "      <th>c_const</th>\n",
       "      <th>ccat</th>\n",
       "      <th>ph</th>\n",
       "      <th>temperature</th>\n",
       "      <th>link</th>\n",
       "      <th>article_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.00645</td>\n",
       "      <td>1.376300</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>1.500</td>\n",
       "      <td>1.50</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.05537</td>\n",
       "      <td>0.264300</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>1.500</td>\n",
       "      <td>1.50</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>4.1</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.03551</td>\n",
       "      <td>8.363000</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.500</td>\n",
       "      <td>25.00</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>13.8</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.22769</td>\n",
       "      <td>0.438200</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.500</td>\n",
       "      <td>25.00</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CoFe2O4</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>24.5</td>\n",
       "      <td>24.5</td>\n",
       "      <td>24.5</td>\n",
       "      <td>naked</td>\n",
       "      <td>oleic acid</td>\n",
       "      <td>0</td>\n",
       "      <td>282.47</td>\n",
       "      <td>0.01725</td>\n",
       "      <td>1.027200</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>0.200</td>\n",
       "      <td>100.00</td>\n",
       "      <td>15.00</td>\n",
       "      <td>0.000026</td>\n",
       "      <td>4.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>https://doi.org/10.1039/C4RA15675G</td>\n",
       "      <td>C4RA15675G</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1173</th>\n",
       "      <td>Ti3C2</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>6.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>ssDNA</td>\n",
       "      <td>DNA</td>\n",
       "      <td>Tetramethylammonium hydroxide</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.85600</td>\n",
       "      <td>0.000080</td>\n",
       "      <td>OPD + H2O2</td>\n",
       "      <td>0.001</td>\n",
       "      <td>2.00</td>\n",
       "      <td>10.00</td>\n",
       "      <td>0.005000</td>\n",
       "      <td>4.0</td>\n",
       "      <td>40.0</td>\n",
       "      <td>https://doi.org/10.1016/j.microc.2021.106238</td>\n",
       "      <td>J.MICROC.2021.106238</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1174</th>\n",
       "      <td>Au</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>ss-DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>17.67000</td>\n",
       "      <td>0.000180</td>\n",
       "      <td>glucose</td>\n",
       "      <td>5.000</td>\n",
       "      <td>25.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>https://doi.org/10.1002/anie.201105121</td>\n",
       "      <td>ANIE.201105121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1175</th>\n",
       "      <td>Au</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>ds-DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>6.98000</td>\n",
       "      <td>0.000530</td>\n",
       "      <td>glucose</td>\n",
       "      <td>5.000</td>\n",
       "      <td>25.00</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>https://doi.org/10.1002/anie.201105121</td>\n",
       "      <td>ANIE.201105121</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1176</th>\n",
       "      <td>Au</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.97000</td>\n",
       "      <td>0.000630</td>\n",
       "      <td>glucose</td>\n",
       "      <td>0.010</td>\n",
       "      <td>0.25</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7.2</td>\n",
       "      <td>25.0</td>\n",
       "      <td>https://doi.org/10.1021/nn102592h</td>\n",
       "      <td>NN102592H</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1177</th>\n",
       "      <td>Pt</td>\n",
       "      <td>laccase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>4.6</td>\n",
       "      <td>4.6</td>\n",
       "      <td>4.6</td>\n",
       "      <td>DNA</td>\n",
       "      <td>0</td>\n",
       "      <td>p-Dimethylaminobenzaldehyde</td>\n",
       "      <td>149.19</td>\n",
       "      <td>0.12000</td>\n",
       "      <td>0.000142</td>\n",
       "      <td>Dichlorophenol  + aminoantipyrine</td>\n",
       "      <td>0.001</td>\n",
       "      <td>2.00</td>\n",
       "      <td>0.32</td>\n",
       "      <td>0.001400</td>\n",
       "      <td>7.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>https://doi.org/10.1007/s10562-017-2106-5</td>\n",
       "      <td>S10562-017-2106-5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1177 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      formula    activity  syngony  length  width  depth surface     polymer  \\\n",
       "0     CoFe2O4  peroxidase      7.0     4.1    4.1    4.1   naked  oleic acid   \n",
       "1     CoFe2O4  peroxidase      7.0    13.8   13.8   13.8   naked  oleic acid   \n",
       "2     CoFe2O4  peroxidase      7.0     4.1    4.1    4.1   naked  oleic acid   \n",
       "3     CoFe2O4  peroxidase      7.0    13.8   13.8   13.8   naked  oleic acid   \n",
       "4     CoFe2O4  peroxidase      7.0    24.5   24.5   24.5   naked  oleic acid   \n",
       "...       ...         ...      ...     ...    ...    ...     ...         ...   \n",
       "1173    Ti3C2  peroxidase      6.0     NaN    NaN    NaN   ssDNA         DNA   \n",
       "1174       Au     oxidase      7.0    13.0   13.0   13.0  ss-DNA           0   \n",
       "1175       Au     oxidase      7.0    13.0   13.0   13.0  ds-DNA           0   \n",
       "1176       Au     oxidase      7.0    13.0   13.0   13.0     DNA           0   \n",
       "1177       Pt     laccase      7.0     4.6    4.6    4.6     DNA           0   \n",
       "\n",
       "                        surfactants  molar_mass        km     v_max  \\\n",
       "0                                 0      282.47   0.00645  1.376300   \n",
       "1                                 0      282.47   0.05537  0.264300   \n",
       "2                                 0      282.47   0.03551  8.363000   \n",
       "3                                 0      282.47   0.22769  0.438200   \n",
       "4                                 0      282.47   0.01725  1.027200   \n",
       "...                             ...         ...       ...       ...   \n",
       "1173  Tetramethylammonium hydroxide         NaN   0.85600  0.000080   \n",
       "1174                              0        0.00  17.67000  0.000180   \n",
       "1175                              0        0.00   6.98000  0.000530   \n",
       "1176                              0         NaN   6.97000  0.000630   \n",
       "1177    p-Dimethylaminobenzaldehyde      149.19   0.12000  0.000142   \n",
       "\n",
       "                          reaction_type  c_min   c_max  c_const      ccat  \\\n",
       "0                            TMB + H2O2  1.500    1.50   100.00  0.000026   \n",
       "1                            TMB + H2O2  1.500    1.50   100.00  0.000026   \n",
       "2                            H2O2 + TMB  0.500   25.00   100.00  0.000026   \n",
       "3                            H2O2 + TMB  0.500   25.00   100.00  0.000026   \n",
       "4                            TMB + H2O2  0.200  100.00    15.00  0.000026   \n",
       "...                                 ...    ...     ...      ...       ...   \n",
       "1173                         OPD + H2O2  0.001    2.00    10.00  0.005000   \n",
       "1174                            glucose  5.000   25.00      NaN       NaN   \n",
       "1175                            glucose  5.000   25.00      NaN       NaN   \n",
       "1176                            glucose  0.010    0.25      NaN       NaN   \n",
       "1177  Dichlorophenol  + aminoantipyrine  0.001    2.00     0.32  0.001400   \n",
       "\n",
       "       ph  temperature                                           link  \\\n",
       "0     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "1     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "2     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "3     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "4     4.0         37.0             https://doi.org/10.1039/C4RA15675G   \n",
       "...   ...          ...                                            ...   \n",
       "1173  4.0         40.0   https://doi.org/10.1016/j.microc.2021.106238   \n",
       "1174  NaN         34.0        https://doi.org/10.1002/anie.201105121    \n",
       "1175  NaN         34.0        https://doi.org/10.1002/anie.201105121    \n",
       "1176  7.2         25.0              https://doi.org/10.1021/nn102592h   \n",
       "1177  7.0         25.0  https://doi.org/10.1007/s10562-017-2106-5       \n",
       "\n",
       "                 article_id  \n",
       "0                C4RA15675G  \n",
       "1                C4RA15675G  \n",
       "2                C4RA15675G  \n",
       "3                C4RA15675G  \n",
       "4                C4RA15675G  \n",
       "...                     ...  \n",
       "1173   J.MICROC.2021.106238  \n",
       "1174        ANIE.201105121   \n",
       "1175        ANIE.201105121   \n",
       "1176              NN102592H  \n",
       "1177  S10562-017-2106-5      \n",
       "\n",
       "[1177 rows x 21 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nanozymes_df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "406"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(nanozymes_df[\"link\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from dotenv import load_dotenv\n",
    "from openai import OpenAI\n",
    "from pydantic import BaseModel\n",
    "\n",
    "load_dotenv(override=True)\n",
    "\n",
    "\n",
    "class Experiment(BaseModel):\n",
    "    formula: str\n",
    "    activity: str\n",
    "    syngony: float\n",
    "    length: float\n",
    "    width: float\n",
    "    depth: float\n",
    "    surface: str\n",
    "    polymer: str\n",
    "    surfactants: str\n",
    "    molar_mass: float\n",
    "    km: float\n",
    "    v_max: float\n",
    "    reaction_type: str\n",
    "    c_min: float\n",
    "    c_max: float\n",
    "    c_const: float\n",
    "    ccat: float\n",
    "    ph: float\n",
    "    temperature: float\n",
    "    experiment_title: str\n",
    "\n",
    "\n",
    "class Response(BaseModel):\n",
    "    experiments: list[Experiment]\n",
    "\n",
    "\n",
    "client = OpenAI()\n",
    "\n",
    "prompt = \"\"\"You are an expert in the field of nanozymes. Your task is to analyze the provided text and extract a structured list of experiments. For each experiment, extract the following parameters, converting units where necessary:\n",
    "\n",
    "- **formula**: The chemical formula of the nanozyme. Do not use subscripts or superscripts; write formulas in plain text as a human would (e.g., H2O2).\n",
    "- **activity**: The type of activity (e.g., peroxidase, oxidase). Write the activity in lowercase.\n",
    "- **syngony**: The crystal system or symmetry. Convert this into the following categories:  \n",
    "  0 - amorphous  \n",
    "  1 - triclinic  \n",
    "  2 - monoclinic  \n",
    "  3 - orthorhombic  \n",
    "  4 - tetragonal  \n",
    "  5 - trigonal  \n",
    "  6 - hexagonal  \n",
    "  7 - cubic  \n",
    "  If no clear match can be made, use the most frequently occurring category from other experiments. If no clear matches in other experiments, choose the one that is most similar in meaning.\n",
    "- **length**: The size of the nanozyme in nanometers (nm).\n",
    "- **width**: The width of the nanozyme in nanometers (nm).\n",
    "- **depth**: The depth of the nanozyme in nanometers (nm).\n",
    "- **surface**: The surface chemistry of the nanozyme. If not specified, use the default value `naked`.\n",
    "- **polymer**: The polymer used in the synthesis. If not specified, use the default value `0`.\n",
    "- **surfactants**: The surfactant used in the synthesis. If not specified, use the default value `0`.\n",
    "- **molar_mass**: The molar mass in grams per mole (g/mol).\n",
    "- **km**: The Michaelis constant Km in millimoles per liter (mM).\n",
    "- **v_max**: The maximum reaction rate Vmax in millimoles per second (mM/s).\n",
    "- **reaction_type**: The type of reaction, where the first component is the substrate and the second is the co-substrate. For example, TMB + H2O2 and H2O2 + TMB should be treated as different reaction types.\n",
    "- **c_min**: The minimum concentration of the substrate in millimoles per liter (mM).\n",
    "- **c_max**: The maximum concentration of the substrate in millimoles per liter (mM).\n",
    "- **c_const**: The concentration of the co-substrate in millimoles per liter (mM).\n",
    "- **ccat**: The concentration of nanoparticles in the measurement of catalytic activity in milligrams per milliliter (mg/mL).\n",
    "- **ph**: The pH at which the catalytic activity was measured.\n",
    "- **temperature**: The temperature at which the research was carried out in degrees Celsius (°C).\n",
    "\n",
    "If any numerical value is missing, please use `nan` to indicate its absence. Ensure that all parameters are accurately extracted and categorized where applicable, and take note that the order of components in the reaction type (substrate + co-substrate) matters. Also, remember to write chemical formulas plainly without subscripts or superscripts (e.g., H2O2 instead of \\(H_2O_2\\)).\n",
    "\n",
    "If the text contains a reaction type written in both forms, such as:\n",
    "- **TMB + H2O2**: TMB as substrate, H2O2 as co-substrate\n",
    "- **H2O2 + TMB**: H2O2 as substrate, TMB as co-substrate\n",
    "\n",
    "You must split this experiment into two distinct experiments, one for each reaction type, reflecting the change in the order of the substrate and co-substrate.\n",
    "\n",
    "If the surface chemistry is not specified, use `naked` as the default value. For both `polymer` and `surfactants`, if they are not mentioned, assign the value `0`.\n",
    "\n",
    "If the syngony category cannot be matched directly, use the most frequently occurring category from other experiments or select the category that is the closest match in meaning.\n",
    "\n",
    "Ensure that all activities are written in lowercase.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 403/403 [48:22<00:00,  7.20s/it] \n"
     ]
    }
   ],
   "source": [
    "assistant_df_list = []\n",
    "failed_md_list = []\n",
    "folder_path = \"./markdown_answers\"\n",
    "for article_name in tqdm(os.listdir(folder_path)):\n",
    "    with open(\n",
    "        os.path.join(folder_path, article_name),\n",
    "        \"r\",\n",
    "        encoding=\"utf-8\",\n",
    "    ) as file:\n",
    "        content = file.read()\n",
    "\n",
    "    completion = client.beta.chat.completions.parse(\n",
    "        model=\"gpt-4o-2024-08-06\",\n",
    "        messages=[\n",
    "            {\"role\": \"system\", \"content\": prompt},\n",
    "            {\"role\": \"user\", \"content\": f\"Text:\\n{content}\"},\n",
    "        ],\n",
    "        response_format=Response,\n",
    "    )\n",
    "    message = completion.choices[0].message\n",
    "    if not message.parsed:\n",
    "        print(article_name, message.refusal)\n",
    "        failed_md_list.append(article_name)\n",
    "        continue\n",
    "    data = [experiment.dict() for experiment in message.parsed.experiments]\n",
    "    article_df = pd.DataFrame(data)\n",
    "    article_df[\"assistant_answer\"] = article_name\n",
    "    article_df[\"article_id\"] = extract_id_from_filename(article_name)\n",
    "    assistant_df_list.append(article_df)\n",
    "\n",
    "assistant_df = pd.concat(assistant_df_list).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "failed_md_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>formula</th>\n",
       "      <th>activity</th>\n",
       "      <th>syngony</th>\n",
       "      <th>length</th>\n",
       "      <th>width</th>\n",
       "      <th>depth</th>\n",
       "      <th>surface</th>\n",
       "      <th>polymer</th>\n",
       "      <th>surfactants</th>\n",
       "      <th>molar_mass</th>\n",
       "      <th>km</th>\n",
       "      <th>v_max</th>\n",
       "      <th>reaction_type</th>\n",
       "      <th>c_min</th>\n",
       "      <th>c_max</th>\n",
       "      <th>c_const</th>\n",
       "      <th>ccat</th>\n",
       "      <th>ph</th>\n",
       "      <th>temperature</th>\n",
       "      <th>experiment_title</th>\n",
       "      <th>assistant_answer</th>\n",
       "      <th>article_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MnO2</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.03000</td>\n",
       "      <td>0.046000</td>\n",
       "      <td>ABTS + H2O2</td>\n",
       "      <td>0.005</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0100</td>\n",
       "      <td>3.8</td>\n",
       "      <td>25.0</td>\n",
       "      <td>MnO2 Nanozyme with ABTS</td>\n",
       "      <td>03067319.2019.1599875.md</td>\n",
       "      <td>03067319.2019.1599875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MnO2</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.02700</td>\n",
       "      <td>0.113000</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>0.005</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0100</td>\n",
       "      <td>3.8</td>\n",
       "      <td>25.0</td>\n",
       "      <td>MnO2 Nanozyme with TMB</td>\n",
       "      <td>03067319.2019.1599875.md</td>\n",
       "      <td>03067319.2019.1599875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Fe3O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00275</td>\n",
       "      <td>0.000014</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 1: Fe₃O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CoFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2.2</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00521</td>\n",
       "      <td>0.000239</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 2: CoFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CuFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.3</td>\n",
       "      <td>2.3</td>\n",
       "      <td>2.3</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00118</td>\n",
       "      <td>0.000082</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 3: CuFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1637</th>\n",
       "      <td>AuPt</td>\n",
       "      <td>peroxidase-like</td>\n",
       "      <td>7.0</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>ascorbic acid</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0013</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 4</td>\n",
       "      <td>srep40103.md</td>\n",
       "      <td>SREP40103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1638</th>\n",
       "      <td>AuPt</td>\n",
       "      <td>oxidase-like</td>\n",
       "      <td>7.0</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>ascorbic acid</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>TMB</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0013</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 5</td>\n",
       "      <td>srep40103.md</td>\n",
       "      <td>SREP40103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1639</th>\n",
       "      <td>CuO</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>79.545</td>\n",
       "      <td>0.50000</td>\n",
       "      <td>1.200000</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.100</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0500</td>\n",
       "      <td>7.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 1: CuO Nanozyme</td>\n",
       "      <td>thno.19257.md</td>\n",
       "      <td>THNO.19257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1640</th>\n",
       "      <td>Fe3O4</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>PEG</td>\n",
       "      <td>PEG</td>\n",
       "      <td>0</td>\n",
       "      <td>231.532</td>\n",
       "      <td>3.50000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>TMB + NH3</td>\n",
       "      <td>0.010</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.1000</td>\n",
       "      <td>8.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>Experiment 2: Fe3O4 Nanozyme</td>\n",
       "      <td>thno.19257.md</td>\n",
       "      <td>THNO.19257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1641</th>\n",
       "      <td>CeO2</td>\n",
       "      <td>superoxide dismutase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>172.114</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>O2- + O2</td>\n",
       "      <td>0.050</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>6.5</td>\n",
       "      <td>30.0</td>\n",
       "      <td>Experiment 3: CeO2 Nanozyme</td>\n",
       "      <td>thno.19257.md</td>\n",
       "      <td>THNO.19257</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1642 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           formula              activity  syngony  length  width  depth  \\\n",
       "0             MnO2               oxidase      0.0     0.0    0.0    0.0   \n",
       "1             MnO2               oxidase      0.0     0.0    0.0    0.0   \n",
       "2       Fe3O4-DOPA            peroxidase      7.0     3.5    3.5    3.5   \n",
       "3     CoFe2O4-DOPA            peroxidase      7.0     2.2    2.2    2.2   \n",
       "4     CuFe2O4-DOPA            peroxidase      7.0     2.3    2.3    2.3   \n",
       "...            ...                   ...      ...     ...    ...    ...   \n",
       "1637          AuPt       peroxidase-like      7.0    23.6   23.6   23.6   \n",
       "1638          AuPt          oxidase-like      7.0    23.6   23.6   23.6   \n",
       "1639           CuO            peroxidase      7.0    50.0   20.0   20.0   \n",
       "1640         Fe3O4               oxidase      7.0    15.0   15.0   15.0   \n",
       "1641          CeO2  superoxide dismutase      7.0    25.0   25.0   25.0   \n",
       "\n",
       "              surface polymer    surfactants  molar_mass       km     v_max  \\\n",
       "0               naked       0              0       0.000  0.03000  0.046000   \n",
       "1               naked       0              0       0.000  0.02700  0.113000   \n",
       "2     dopamine-capped       0              0       0.000  0.00275  0.000014   \n",
       "3     dopamine-capped       0              0       0.000  0.00521  0.000239   \n",
       "4     dopamine-capped       0              0       0.000  0.00118  0.000082   \n",
       "...               ...     ...            ...         ...      ...       ...   \n",
       "1637            naked       0  ascorbic acid       0.000  0.00000  0.000000   \n",
       "1638            naked       0  ascorbic acid       0.000  0.00000  0.000000   \n",
       "1639            naked       0              0      79.545  0.50000  1.200000   \n",
       "1640              PEG     PEG              0     231.532  3.50000  3.000000   \n",
       "1641            naked       0              0     172.114  1.00000  0.500000   \n",
       "\n",
       "     reaction_type  c_min  c_max  c_const    ccat   ph  temperature  \\\n",
       "0      ABTS + H2O2  0.005    1.0      0.0  0.0100  3.8         25.0   \n",
       "1       TMB + H2O2  0.005    1.0      0.0  0.0100  3.8         25.0   \n",
       "2      H2O2 + ABTS  0.000    0.0    100.0  0.0200  4.6         25.0   \n",
       "3      H2O2 + ABTS  0.000    0.0    100.0  0.0200  4.6         25.0   \n",
       "4      H2O2 + ABTS  0.000    0.0    100.0  0.0200  4.6         25.0   \n",
       "...            ...    ...    ...      ...     ...  ...          ...   \n",
       "1637    TMB + H2O2  0.000    0.0    100.0  0.0013  0.0         25.0   \n",
       "1638           TMB  0.000    0.0      0.0  0.0013  0.0         25.0   \n",
       "1639    H2O2 + TMB  0.100    5.0      1.0  0.0500  7.0         25.0   \n",
       "1640     TMB + NH3  0.010   10.0      0.1  0.1000  8.0         37.0   \n",
       "1641      O2- + O2  0.050    2.0      0.5  0.0200  6.5         30.0   \n",
       "\n",
       "                          experiment_title          assistant_answer  \\\n",
       "0                  MnO2 Nanozyme with ABTS  03067319.2019.1599875.md   \n",
       "1                   MnO2 Nanozyme with TMB  03067319.2019.1599875.md   \n",
       "2       Experiment 1: Fe₃O₄-DOPA Nanozymes       1361-6463.aa5bf6.md   \n",
       "3     Experiment 2: CoFe₂O₄-DOPA Nanozymes       1361-6463.aa5bf6.md   \n",
       "4     Experiment 3: CuFe₂O₄-DOPA Nanozymes       1361-6463.aa5bf6.md   \n",
       "...                                    ...                       ...   \n",
       "1637                          Experiment 4              srep40103.md   \n",
       "1638                          Experiment 5              srep40103.md   \n",
       "1639            Experiment 1: CuO Nanozyme             thno.19257.md   \n",
       "1640          Experiment 2: Fe3O4 Nanozyme             thno.19257.md   \n",
       "1641           Experiment 3: CeO2 Nanozyme             thno.19257.md   \n",
       "\n",
       "                 article_id  \n",
       "0     03067319.2019.1599875  \n",
       "1     03067319.2019.1599875  \n",
       "2          1361-6463.AA5BF6  \n",
       "3          1361-6463.AA5BF6  \n",
       "4          1361-6463.AA5BF6  \n",
       "...                     ...  \n",
       "1637              SREP40103  \n",
       "1638              SREP40103  \n",
       "1639             THNO.19257  \n",
       "1640             THNO.19257  \n",
       "1641             THNO.19257  \n",
       "\n",
       "[1642 rows x 22 columns]"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "assistant_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "assistant_df = assistant_df.drop_duplicates(\n",
    "    subset=[\n",
    "        \"formula\",\n",
    "        \"activity\",\n",
    "        \"syngony\",\n",
    "        \"length\",\n",
    "        \"width\",\n",
    "        \"depth\",\n",
    "        \"surface\",\n",
    "        \"polymer\",\n",
    "        \"surfactants\",\n",
    "        \"molar_mass\",\n",
    "        \"km\",\n",
    "        \"v_max\",\n",
    "        \"reaction_type\",\n",
    "        \"c_min\",\n",
    "        \"c_max\",\n",
    "        \"c_const\",\n",
    "        \"ccat\",\n",
    "        \"ph\",\n",
    "        \"temperature\",\n",
    "        \"assistant_answer\",\n",
    "        \"article_id\",\n",
    "    ]\n",
    ").reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [],
   "source": [
    "assistant_df.to_csv(\"assistant_df_21_08_2024_v2.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "assistant_df = pd.read_csv(\"assistant_df_21_08_2024_v2.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>formula</th>\n",
       "      <th>activity</th>\n",
       "      <th>syngony</th>\n",
       "      <th>length</th>\n",
       "      <th>width</th>\n",
       "      <th>depth</th>\n",
       "      <th>surface</th>\n",
       "      <th>polymer</th>\n",
       "      <th>surfactants</th>\n",
       "      <th>molar_mass</th>\n",
       "      <th>km</th>\n",
       "      <th>v_max</th>\n",
       "      <th>reaction_type</th>\n",
       "      <th>c_min</th>\n",
       "      <th>c_max</th>\n",
       "      <th>c_const</th>\n",
       "      <th>ccat</th>\n",
       "      <th>ph</th>\n",
       "      <th>temperature</th>\n",
       "      <th>experiment_title</th>\n",
       "      <th>assistant_answer</th>\n",
       "      <th>article_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>MnO2</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.03000</td>\n",
       "      <td>0.046000</td>\n",
       "      <td>ABTS + H2O2</td>\n",
       "      <td>0.005</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0100</td>\n",
       "      <td>3.8</td>\n",
       "      <td>25.0</td>\n",
       "      <td>MnO2 Nanozyme with ABTS</td>\n",
       "      <td>03067319.2019.1599875.md</td>\n",
       "      <td>03067319.2019.1599875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>MnO2</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.02700</td>\n",
       "      <td>0.113000</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>0.005</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0100</td>\n",
       "      <td>3.8</td>\n",
       "      <td>25.0</td>\n",
       "      <td>MnO2 Nanozyme with TMB</td>\n",
       "      <td>03067319.2019.1599875.md</td>\n",
       "      <td>03067319.2019.1599875</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Fe3O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00275</td>\n",
       "      <td>0.000014</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 1: Fe₃O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CoFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2.2</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00521</td>\n",
       "      <td>0.000239</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 2: CoFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CuFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.3</td>\n",
       "      <td>2.3</td>\n",
       "      <td>2.3</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00118</td>\n",
       "      <td>0.000082</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.0</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 3: CuFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1494</th>\n",
       "      <td>AuPt</td>\n",
       "      <td>oxidase-like</td>\n",
       "      <td>7.0</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>ascorbic acid</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>TMB</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0013</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 2</td>\n",
       "      <td>srep40103.md</td>\n",
       "      <td>SREP40103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1495</th>\n",
       "      <td>AuPt</td>\n",
       "      <td>ascorbic acid oxidase-like</td>\n",
       "      <td>7.0</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>23.6</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>ascorbic acid</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.00000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>ascorbic acid</td>\n",
       "      <td>0.000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0013</td>\n",
       "      <td>0.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 3</td>\n",
       "      <td>srep40103.md</td>\n",
       "      <td>SREP40103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1496</th>\n",
       "      <td>CuO</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>50.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>79.545</td>\n",
       "      <td>0.50000</td>\n",
       "      <td>1.200000</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.100</td>\n",
       "      <td>5.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0500</td>\n",
       "      <td>7.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 1: CuO Nanozyme</td>\n",
       "      <td>thno.19257.md</td>\n",
       "      <td>THNO.19257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1497</th>\n",
       "      <td>Fe3O4</td>\n",
       "      <td>oxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>PEG</td>\n",
       "      <td>PEG</td>\n",
       "      <td>0</td>\n",
       "      <td>231.532</td>\n",
       "      <td>3.50000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>TMB + NH3</td>\n",
       "      <td>0.010</td>\n",
       "      <td>10.0</td>\n",
       "      <td>0.1</td>\n",
       "      <td>0.1000</td>\n",
       "      <td>8.0</td>\n",
       "      <td>37.0</td>\n",
       "      <td>Experiment 2: Fe3O4 Nanozyme</td>\n",
       "      <td>thno.19257.md</td>\n",
       "      <td>THNO.19257</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1498</th>\n",
       "      <td>CeO2</td>\n",
       "      <td>superoxide dismutase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>172.114</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>0.500000</td>\n",
       "      <td>O2- + O2</td>\n",
       "      <td>0.050</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.5</td>\n",
       "      <td>0.0200</td>\n",
       "      <td>6.5</td>\n",
       "      <td>30.0</td>\n",
       "      <td>Experiment 3: CeO2 Nanozyme</td>\n",
       "      <td>thno.19257.md</td>\n",
       "      <td>THNO.19257</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1499 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           formula                    activity  syngony  length  width  depth  \\\n",
       "0             MnO2                     oxidase      0.0     0.0    0.0    0.0   \n",
       "1             MnO2                     oxidase      0.0     0.0    0.0    0.0   \n",
       "2       Fe3O4-DOPA                  peroxidase      7.0     3.5    3.5    3.5   \n",
       "3     CoFe2O4-DOPA                  peroxidase      7.0     2.2    2.2    2.2   \n",
       "4     CuFe2O4-DOPA                  peroxidase      7.0     2.3    2.3    2.3   \n",
       "...            ...                         ...      ...     ...    ...    ...   \n",
       "1494          AuPt                oxidase-like      7.0    23.6   23.6   23.6   \n",
       "1495          AuPt  ascorbic acid oxidase-like      7.0    23.6   23.6   23.6   \n",
       "1496           CuO                  peroxidase      7.0    50.0   20.0   20.0   \n",
       "1497         Fe3O4                     oxidase      7.0    15.0   15.0   15.0   \n",
       "1498          CeO2        superoxide dismutase      7.0    25.0   25.0   25.0   \n",
       "\n",
       "              surface polymer    surfactants  molar_mass       km     v_max  \\\n",
       "0               naked       0              0       0.000  0.03000  0.046000   \n",
       "1               naked       0              0       0.000  0.02700  0.113000   \n",
       "2     dopamine-capped       0              0       0.000  0.00275  0.000014   \n",
       "3     dopamine-capped       0              0       0.000  0.00521  0.000239   \n",
       "4     dopamine-capped       0              0       0.000  0.00118  0.000082   \n",
       "...               ...     ...            ...         ...      ...       ...   \n",
       "1494            naked       0  ascorbic acid       0.000  0.00000  0.000000   \n",
       "1495            naked       0  ascorbic acid       0.000  0.00000  0.000000   \n",
       "1496            naked       0              0      79.545  0.50000  1.200000   \n",
       "1497              PEG     PEG              0     231.532  3.50000  3.000000   \n",
       "1498            naked       0              0     172.114  1.00000  0.500000   \n",
       "\n",
       "      reaction_type  c_min  c_max  c_const    ccat   ph  temperature  \\\n",
       "0       ABTS + H2O2  0.005    1.0      0.0  0.0100  3.8         25.0   \n",
       "1        TMB + H2O2  0.005    1.0      0.0  0.0100  3.8         25.0   \n",
       "2       H2O2 + ABTS  0.000    0.0    100.0  0.0200  4.6         25.0   \n",
       "3       H2O2 + ABTS  0.000    0.0    100.0  0.0200  4.6         25.0   \n",
       "4       H2O2 + ABTS  0.000    0.0    100.0  0.0200  4.6         25.0   \n",
       "...             ...    ...    ...      ...     ...  ...          ...   \n",
       "1494            TMB  0.000    0.0      0.0  0.0013  0.0         25.0   \n",
       "1495  ascorbic acid  0.000    0.0      0.0  0.0013  0.0         25.0   \n",
       "1496     H2O2 + TMB  0.100    5.0      1.0  0.0500  7.0         25.0   \n",
       "1497      TMB + NH3  0.010   10.0      0.1  0.1000  8.0         37.0   \n",
       "1498       O2- + O2  0.050    2.0      0.5  0.0200  6.5         30.0   \n",
       "\n",
       "                          experiment_title          assistant_answer  \\\n",
       "0                  MnO2 Nanozyme with ABTS  03067319.2019.1599875.md   \n",
       "1                   MnO2 Nanozyme with TMB  03067319.2019.1599875.md   \n",
       "2       Experiment 1: Fe₃O₄-DOPA Nanozymes       1361-6463.aa5bf6.md   \n",
       "3     Experiment 2: CoFe₂O₄-DOPA Nanozymes       1361-6463.aa5bf6.md   \n",
       "4     Experiment 3: CuFe₂O₄-DOPA Nanozymes       1361-6463.aa5bf6.md   \n",
       "...                                    ...                       ...   \n",
       "1494                          Experiment 2              srep40103.md   \n",
       "1495                          Experiment 3              srep40103.md   \n",
       "1496            Experiment 1: CuO Nanozyme             thno.19257.md   \n",
       "1497          Experiment 2: Fe3O4 Nanozyme             thno.19257.md   \n",
       "1498           Experiment 3: CeO2 Nanozyme             thno.19257.md   \n",
       "\n",
       "                 article_id  \n",
       "0     03067319.2019.1599875  \n",
       "1     03067319.2019.1599875  \n",
       "2          1361-6463.AA5BF6  \n",
       "3          1361-6463.AA5BF6  \n",
       "4          1361-6463.AA5BF6  \n",
       "...                     ...  \n",
       "1494              SREP40103  \n",
       "1495              SREP40103  \n",
       "1496             THNO.19257  \n",
       "1497             THNO.19257  \n",
       "1498             THNO.19257  \n",
       "\n",
       "[1499 rows x 22 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display assistant_df as the cell's rich output.\n",
    "assistant_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['C8CC07800A',\n",
       " 'C1CY00124H',\n",
       " 'C8NJ00097B',\n",
       " '1361-6528.AADDC2',\n",
       " 'ACSAMI.0C12593',\n",
       " 'J.BIOMATERIALS.2010.09.040',\n",
       " 'C5RA07636F',\n",
       " 'ANGE.201904751',\n",
       " 'J.SNB.2016.07.168',\n",
       " 'SMLL.201903182',\n",
       " 'J.BIOS.2018.08.004',\n",
       " 'J.SAA.2019.117412',\n",
       " 'C4NR04115A',\n",
       " 'C8TB01948G',\n",
       " 'J.TALANTA.2020.121680',\n",
       " 'C6RA00368K',\n",
       " 'J.APSUSC.2016.12.067',\n",
       " 'ANIE.200805279',\n",
       " 'ACS.LANGMUIR.7B03430',\n",
       " 'J.MSEC.2015.10.046',\n",
       " 'CHEM.202100567',\n",
       " 'C6CC00194G',\n",
       " 'ACSANM.3C01652',\n",
       " 'C8CC07062H',\n",
       " 'J.CEJ.2017.08.026',\n",
       " 'ACS.INORGCHEM.0C03355',\n",
       " 'D3NJ00136A',\n",
       " 'J.JTICE.2021.03.029',\n",
       " 'NANO9020210',\n",
       " 'IE403554V',\n",
       " 'J.MOLCATA.2013.05.016',\n",
       " 'C7AY00750G',\n",
       " 'CHEM.201001789',\n",
       " 'ANIE.201909729',\n",
       " 'J.JALLCOM.2016.04.269',\n",
       " 'J.ACA.2020.01.035',\n",
       " 'J.SAA.2017.06.006',\n",
       " 'J.JCIS.2019.01.061',\n",
       " 'J.COLSURFB.2017.02.004',\n",
       " 'J.ACA.2012.11.056',\n",
       " 'AOC.4465',\n",
       " 'ACSAMI.0C01789',\n",
       " 'C5RA11014A',\n",
       " 'S00604-017-2552-1',\n",
       " 'J.SNB.2019.04.020',\n",
       " 'J.TALANTA.2013.01.032',\n",
       " 'S41467-021-23737-1',\n",
       " 'ACSANM.8B00945',\n",
       " 'J.JALLCOM.2019.01.225',\n",
       " 'SREP40103',\n",
       " 'C7NJ00292K',\n",
       " 'CBIC.202000147',\n",
       " 'ADHM.202300490',\n",
       " 'J.SNB.2017.11.057',\n",
       " 'ACSSUSCHEMENG.1C07568',\n",
       " 'C8TB01706A',\n",
       " 'S00604-020-04376-7',\n",
       " 'J.JCIS.2015.08.032',\n",
       " 'J.TALANTA.2018.01.080',\n",
       " 'SMLL.202003496',\n",
       " 'J.SNB.2014.12.052',\n",
       " 'C7RA10370K',\n",
       " 'J.ACA.2022.339564',\n",
       " 'C5AN01103E',\n",
       " 'C9NJ05955E',\n",
       " 'ANIE.201800681',\n",
       " 'C6AY02275H',\n",
       " 'C4CC06684G',\n",
       " 'S12951-019-0487-X',\n",
       " 'S00604-019-3293-0',\n",
       " 'C5NR01728A',\n",
       " 'JF500950P',\n",
       " 'S00604-017-2582-8',\n",
       " 'ACSSUSCHEMENG.0C06920',\n",
       " 'S0039914021005683',\n",
       " 'EJIC.202200387',\n",
       " 'S10853-018-2657-X',\n",
       " 'C9NJ03527C',\n",
       " 'ACSANM.8B00153',\n",
       " 'ADFM.201001302',\n",
       " 'J.SNB.2020.128642',\n",
       " 'S00604-017-2198-Z',\n",
       " 'C6NR00860G',\n",
       " 'S00604-015-1690-6',\n",
       " 'C7TB02676E',\n",
       " 'C4NR03393K',\n",
       " 'J.BIOS.2019.111450',\n",
       " 'J.JHAZMAT.2020.125019',\n",
       " 'ANIE.201708573',\n",
       " 'J.MOLCATA.2012.03.007',\n",
       " 'ACSNANO.9B04366',\n",
       " 'AC503544W',\n",
       " 'J.JTICE.2015.08.021',\n",
       " 'THNO.19257',\n",
       " 'C8TB01853G',\n",
       " 'C2AN35072F',\n",
       " 'NN102592H',\n",
       " 'S00604-016-1915-3',\n",
       " 'ACSSUSCHEMENG.8B04067',\n",
       " 'J.SNB.2012.11.074',\n",
       " 'J.SNB.2022.132066',\n",
       " 'D0NJ05026A',\n",
       " 'ACS.IECR.7B04821',\n",
       " 'J.TALANTA.2011.09.026',\n",
       " 'J.SNB.2012.07.070',\n",
       " 'D1NJ00819F',\n",
       " 'J.MICROC.2019.104352',\n",
       " 'J.COLSURFB.2021.111783',\n",
       " 'S00216-021-03347-Y',\n",
       " 'ACSOMEGA.0C00147',\n",
       " 'D0NJ02795B',\n",
       " 'ADFM.201801484',\n",
       " 'J.MSEC.2015.05.028',\n",
       " 'AM300408R',\n",
       " 'S00604-017-2326-9',\n",
       " 'ACS.JPCC.1C10325',\n",
       " 'ACSAMI.9B16279',\n",
       " 'ACSANM.8B00578',\n",
       " 'J.JCIS.2018.12.093',\n",
       " 'S16040584',\n",
       " 'J.SNB.2016.08.094',\n",
       " 'C3TA15051H',\n",
       " 'ADMA.201203218',\n",
       " 'J.SNB.2018.04.150',\n",
       " 'S11051-018-4271-X',\n",
       " 'S41664-019-00104-0',\n",
       " 'J.BIOS.2019.111983',\n",
       " 'C5NR08038J',\n",
       " 'D0TB03005H',\n",
       " 'ACSAMI.7B17916',\n",
       " 'J.JCIS.2022.01.041',\n",
       " 'J.COLSURFA.2021.126585',\n",
       " 'S00604-020-04451-Z',\n",
       " 'CHEMOSENSORS1000359',\n",
       " 'CCTC.201100064',\n",
       " 'C6TB00422A',\n",
       " 'S00604-016-1955-8',\n",
       " 'ACSAMI.5B05180',\n",
       " 'SMLL.202001099',\n",
       " 'J.SINTL.2020.100031',\n",
       " 'S13205-017-1082-1',\n",
       " 'J.BIOS.2014.08.078',\n",
       " 'JACS.0C12605',\n",
       " 'C9AN01262A',\n",
       " 'CHEM.201101191',\n",
       " 'S41596-018-0001-1',\n",
       " 'D0TA09247A',\n",
       " 'JACS.5B12070',\n",
       " 'CHEM.202104247',\n",
       " 'J.MICROC.2023.108562',\n",
       " 'J.BIOS.2014.07.001',\n",
       " 'J.JHAZMAT.2020.123939',\n",
       " 'C6QM00149A',\n",
       " 'C7NJ03880A',\n",
       " 'J.TALANTA.2021.122594',\n",
       " 'NNANO.2007.260',\n",
       " 'C2TB00389A',\n",
       " 'ACSANM.8B00895',\n",
       " 'J.CEJ.2023.142494',\n",
       " 'ACSANM.0C02807',\n",
       " 'IE504114V',\n",
       " 'J.SNB.2018.05.108',\n",
       " 'J.COLSURFA.2022.129887',\n",
       " 'ACSNANO.1C11012',\n",
       " 'ACSNANO.1C03128',\n",
       " 'ASIA.201500942',\n",
       " 'JCCS.201700031',\n",
       " 'C5NJ02705E',\n",
       " 'J.APSUSC.2019.03.337',\n",
       " 'ACSANM.2C05400',\n",
       " 'C9TB00730J',\n",
       " 'S12274-016-1395-0',\n",
       " 'CRAT.201300440',\n",
       " 'J.ACA.2014.10.011',\n",
       " 'C6RA18050G',\n",
       " 'C8NR06162A',\n",
       " 'C5AN00031A',\n",
       " 'ACSAMI.0C07886',\n",
       " 'J.TALANTA.2018.07.073',\n",
       " 'ADFM.201800018',\n",
       " 'C0JM00174K',\n",
       " 'ACSNANO.5B03525',\n",
       " 'J.JPCS.2021.110534',\n",
       " 'D0CC06209J',\n",
       " 'J.MICROC.2019.104019',\n",
       " 'C6RA14856E',\n",
       " 'J.MOLCATA.2012.04.011',\n",
       " 'D0TB01337D',\n",
       " 'CHEM.201200643',\n",
       " 'J.COLSURFB.2019.110764',\n",
       " 'C2CC17013B',\n",
       " 'C1CC11943E',\n",
       " 'J.ACA.2016.10.013',\n",
       " 'C4CC01703J',\n",
       " 'J.SNB.2019.126876',\n",
       " 'J.TALANTA.2019.06.003',\n",
       " 'J.SURFIN.2021.101109',\n",
       " 'ACSANM.1C01457',\n",
       " 'S11051-016-3357-6',\n",
       " 'CHEM.201800770',\n",
       " 'C3NR06896J',\n",
       " 'J.BIOS.2014.11.032',\n",
       " 'S00604-021-04942-7',\n",
       " 'J.ACA.2016.11.035',\n",
       " 'J.SNB.2017.06.175',\n",
       " 'C1CC14300J',\n",
       " 'ACSABM.0C00605',\n",
       " 'J.APCATA.2012.01.025',\n",
       " 'J.SNB.2021.129560',\n",
       " 'J.JCIS.2017.07.014',\n",
       " 'JACS.0C09567',\n",
       " 'C4RA12596G',\n",
       " 'J.JTICE.2017.12.011',\n",
       " 'CM8031863',\n",
       " 'S11434-016-1193-9',\n",
       " 'J.SNB.2016.04.041',\n",
       " 'NANO10061015',\n",
       " 'J.MATLET.2013.04.020',\n",
       " 'C3NR06533B',\n",
       " 'ACSAMI.6B05354',\n",
       " 'ANIE.201105121',\n",
       " 'C6CC08542C',\n",
       " 'J.ACA.2017.09.011',\n",
       " 'J.COLSURFA.2021.126985',\n",
       " 'J.BIOMATERIALS.2015.04.039',\n",
       " 'ACSNANO.6B05810',\n",
       " 'AC300939Z',\n",
       " 'S00604-015-1697-Z',\n",
       " 'J.TALANTA.2019.120707',\n",
       " 'C3CC41569D',\n",
       " 'C8DT01146J',\n",
       " 'J.COLSURFA.2020.125397',\n",
       " 'C6TB01233G',\n",
       " 'J.ACA.2015.04.052',\n",
       " 'J.MSEC.2018.10.038',\n",
       " 'J.APCATB.2012.07.005',\n",
       " 'S00216-017-0372-0',\n",
       " 'J.SNB.2017.02.059',\n",
       " 'C7NR03399K',\n",
       " 'ADHM.201500173',\n",
       " 'ACS.ANALCHEM.7B02880',\n",
       " 'ACS.ANALCHEM.5B02728',\n",
       " 'J.APCATB.2019.118256',\n",
       " 'AM501830V',\n",
       " 'C9TB00989B',\n",
       " 'JACS.6B07590',\n",
       " 'C6NR02730J',\n",
       " 'ACSSUSCHEMENG.0C03822',\n",
       " 'CBIC.202000066',\n",
       " 'C6TB02750D',\n",
       " 'ACSAMI.7B05036',\n",
       " 'C2AY25511A',\n",
       " 'J.BIOS.2021.113724',\n",
       " 'J.SNB.2019.126928',\n",
       " 'J.SNB.2019.01.068',\n",
       " 'C5NR05675F',\n",
       " 'S12274-020-2680-5',\n",
       " 'C4TB00968A',\n",
       " 'C5TB00684H',\n",
       " 'J.PROCBIO.2019.05.014',\n",
       " 'ACSAMI.1C10600',\n",
       " 'JNN.2018.13977',\n",
       " 'C8NJ04647F',\n",
       " 'ACSAMI.7B09861',\n",
       " 'J.BIOS.2013.11.040',\n",
       " 'S10562-017-2106-5',\n",
       " 'S12274-018-2154-1',\n",
       " 'J.SNB.2017.06.096',\n",
       " 'ACSAMI.7B01616',\n",
       " 'D0TB00239A',\n",
       " 'D0DT02395G',\n",
       " 'J.TALANTA.2020.121990',\n",
       " 'J.SNB.2017.07.108',\n",
       " 'J.ACA.2020.01.043',\n",
       " 'NANO12050755',\n",
       " '1361-6463.AA5BF6',\n",
       " '5416963',\n",
       " 'J.SNB.2018.09.120',\n",
       " 'J.BEJ.2019.107384',\n",
       " 'C5DT04192A',\n",
       " 'ACS.ANALCHEM.9B04116',\n",
       " 'J.BIOMATERIALS.2009.05.005',\n",
       " 'CHEM.200802158',\n",
       " 'SLCT.201900681',\n",
       " 'J.SAA.2020.118499',\n",
       " 'ACSAMI.0C10981',\n",
       " 'D0NR04177G',\n",
       " 'C6AN02722A',\n",
       " 'S00604-015-1506-8',\n",
       " 'ANGE.202115939',\n",
       " 'C8CC09799B',\n",
       " 'C6TB01881E',\n",
       " 'ACS.ANALCHEM.2C05425',\n",
       " 'NANO10020313',\n",
       " 'ANIE.202106750',\n",
       " 'ACSANM.2C03340',\n",
       " 'J.SNB.2012.03.045',\n",
       " 'J.TALANTA.2018.01.019',\n",
       " 'S00604-017-2562-Z',\n",
       " 'ACSSENSORS.6B00500',\n",
       " 'J.SNB.2016.09.049',\n",
       " 'C7NR03179C',\n",
       " 'C5DT01585E',\n",
       " 'J.JCIS.2017.11.064',\n",
       " 'C2AN35700C(1)',\n",
       " 'J.SNB.2019.127106',\n",
       " 'J.ACA.2020.07.068',\n",
       " 'C6RA23773H',\n",
       " 'J.BIOS.2020.112342',\n",
       " 'J.APSUSC.2019.02.135',\n",
       " 'C5NJ02547H',\n",
       " 'CNMA.201600268',\n",
       " 'J.APSUSC.2023.157185',\n",
       " 'S00604-018-3185-8',\n",
       " 'J.SNB.2018.08.051',\n",
       " 'ACSABM.0C00606',\n",
       " 'S10853-017-1181-8',\n",
       " 'ACS.INORGCHEM.9B03512',\n",
       " 'C8NJ01190G',\n",
       " 'J.BIOS.2014.07.048',\n",
       " 'SREP35344',\n",
       " 'J.SNB.2021.130266',\n",
       " 'S00604-019-3395-8',\n",
       " 'J.LWT.2021.112821',\n",
       " 'C7NJ01177F',\n",
       " 'C5CC00040H',\n",
       " 'J.SNB.2017.05.069',\n",
       " 'J.APMT.2019.03.009',\n",
       " 'J.SNB.2016.11.145',\n",
       " 'J.BIOS.2019.111756',\n",
       " 'J.NANTOD.2022.101421',\n",
       " 'J.TALANTA.2023.124872',\n",
       " 'D0CC04101G',\n",
       " 'C3RA23215H',\n",
       " 'J.ACA.2013.03.034',\n",
       " 'ACSAMI.5B00023',\n",
       " 'S00604-020-04399-0',\n",
       " 'D0CC04101G(1)',\n",
       " 'ACSAMI.7B13835',\n",
       " 'C5AY01732G',\n",
       " 'ADMA.201405105',\n",
       " 'D1RA03456A',\n",
       " 'J.CCLET.2021.08.017',\n",
       " 'C4RA15675G',\n",
       " 'S00604-019-3624-1',\n",
       " 'ACSNANO.9B09974',\n",
       " 'C6RA16619A',\n",
       " 'C8RA05487H',\n",
       " 'ACSAMI.2C20878',\n",
       " 'J.BIOS.2016.10.082',\n",
       " 'C2NR31716H',\n",
       " 'JACS.7B00601',\n",
       " 'AM406033Q',\n",
       " 'AM405009F',\n",
       " 'J.MATERRESBULL.2015.03.018',\n",
       " 'D0RA05342B',\n",
       " 'ACSSUSCHEMENG.9B04043',\n",
       " 'J.SNB.2020.128850',\n",
       " 'S00216-018-1423-X',\n",
       " 'S41664-019-00100-4',\n",
       " 'C7NJ00899F',\n",
       " 'S1872-2040(17)61083-1',\n",
       " 'LA104566E',\n",
       " 'J.MSEC.2014.04.038',\n",
       " 'C6AY03034C',\n",
       " 'J.BIOS.2011.02.014',\n",
       " 'ACSSUSCHEMENG.9B02459',\n",
       " 'S41467-019-08657-5',\n",
       " 'CJOC.201300683',\n",
       " 'D2NR01375D',\n",
       " 'PPSC.201500043',\n",
       " 'ACSABM.9B01107',\n",
       " 'ACSNANO.6B06297',\n",
       " 'ACSAMI.5B01271',\n",
       " 'C6RA00963H',\n",
       " 'C8NJ00324F',\n",
       " 'C9RA01227C',\n",
       " 'C2AN35700C',\n",
       " 'J.MICROC.2021.106238',\n",
       " '03067319.2019.1599875',\n",
       " 'ADFM.202001933',\n",
       " 'S00604-020-04298-4',\n",
       " 'J.COLSURFA.2021.126427',\n",
       " 'J.TALANTA.2021.122647',\n",
       " 'J.SAA.2023.123003',\n",
       " 'J.TALANTA.2020.121142',\n",
       " 'C2CC32833J',\n",
       " 'J.BIOS.2014.08.062',\n",
       " 'J.JCIS.2021.06.170',\n",
       " 'J.TALANTA.2021.122337',\n",
       " 'J.BIOMATERIALS.2010.11.004',\n",
       " 'J.TALANTA.2015.03.050',\n",
       " 'JACS.5B09337',\n",
       " 'S00604-015-1670-X',\n",
       " 'ACS.CHEMMATER.6B04283',\n",
       " 'J.COLSURFA.2011.08.008',\n",
       " 'ACSSUSCHEMENG.1C00723',\n",
       " 'S00604-021-05112-5',\n",
       " 'S00216-019-02268-1']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct article ids in first-appearance order. Unlike list(set(...)), this\n",
    "# ordering does not depend on string hash randomisation, so the displayed list\n",
    "# is stable across kernel restarts.\n",
    "assistant_df[\"article_id\"].unique().tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Коэффициент Жаккара: 0.42857142857142855\n",
      "Сопоставленные элементы (a1): [2.00001, 2.0001, 3.0]\n",
      "Несопоставленные элементы из list1 (a2): [1.0001, 4.0]\n",
      "Несопоставленные элементы из list2 (a3): [3.0001, 5.0]\n"
     ]
    }
   ],
   "source": [
    "def jaccard_index_with_floats(list1, list2, epsilon=0.01):\n",
    "    a1 = []\n",
    "    a2 = list1.copy()\n",
    "    a3 = list2.copy()\n",
    "\n",
    "    for x in list1:\n",
    "        matched = False\n",
    "        for y in a3:\n",
    "            if np.isclose(x, y, atol=epsilon):\n",
    "                a1.append(x)\n",
    "                a2.remove(x)\n",
    "                a3.remove(y)\n",
    "                matched = True\n",
    "                break\n",
    "\n",
    "    numerator = len(a1)\n",
    "    denominator = len(a1) + len(a2) + len(a3)\n",
    "\n",
    "    if denominator == 0:\n",
    "        return 0.0\n",
    "\n",
    "    jaccard_coefficient = numerator / denominator\n",
    "\n",
    "    return jaccard_coefficient, a1, a2, a3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Коэффициент Жаккара: 0.42857142857142855\n",
      "Сопоставленные элементы (a1): ['banana', 'banana', 'orange']\n",
      "Несопоставленные элементы из list1 (a2): ['apple', 'apple']\n",
      "Несопоставленные элементы из list2 (a3): ['grape', 'banana']\n"
     ]
    }
   ],
   "source": [
    "def jaccard_index_with_strings(list1, list2):\n",
    "    a1 = []\n",
    "    a2 = list1.copy()\n",
    "    a3 = list2.copy()\n",
    "\n",
    "    for x in list1:\n",
    "        matched = False\n",
    "        for y in a3:\n",
    "            if str(x).upper() == str(y).upper():\n",
    "                a1.append(x)\n",
    "                a2.remove(x)\n",
    "                a3.remove(y)\n",
    "                matched = True\n",
    "                break\n",
    "\n",
    "    numerator = len(a1)\n",
    "    denominator = len(a1) + len(a2) + len(a3)\n",
    "\n",
    "    if denominator == 0:\n",
    "        return 0.0\n",
    "\n",
    "    jaccard_coefficient = numerator / denominator\n",
    "\n",
    "    return jaccard_coefficient, a1, a2, a3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the polymer and surfactants columns of assistant_df as strings.\n",
    "for text_col in (\"polymer\", \"surfactants\"):\n",
    "    assistant_df[text_col] = assistant_df[text_col].astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "def calculate_jaccard_index(df1, df2, epsilon=0.02):\n",
    "    \"\"\"Compute per-article, per-column Jaccard indexes between two frames.\n",
    "\n",
    "    Rows are grouped by ``article_id``. For every article present in both\n",
    "    frames, the distinct values of each shared column are compared with\n",
    "    ``jaccard_index_with_floats`` when the column is numeric in ``df1`` and\n",
    "    with ``jaccard_index_with_strings`` otherwise.\n",
    "\n",
    "    Args:\n",
    "        df1: First frame; must contain an ``article_id`` column.\n",
    "        df2: Second frame; must contain an ``article_id`` column.\n",
    "        epsilon: Absolute tolerance forwarded to the numeric comparison.\n",
    "\n",
    "    Returns:\n",
    "        A 4-tuple ``(jaccard_indexes, unmatched_df1, unmatched_df2, common_columns)``:\n",
    "        ``jaccard_indexes`` maps each shared article id to a\n",
    "        ``{column: coefficient}`` dict; ``unmatched_df1`` / ``unmatched_df2``\n",
    "        contain the rows whose article is absent from the other frame;\n",
    "        ``common_columns`` lists the columns present in both frames\n",
    "        (``article_id`` itself included).\n",
    "    \"\"\"\n",
    "    common_columns = df1.columns.intersection(df2.columns).tolist()\n",
    "\n",
    "    # Build the masks on the frames' own indexes so the boolean indexing at\n",
    "    # the end stays aligned even without a default RangeIndex.\n",
    "    df1_matched = pd.Series(False, index=df1.index)\n",
    "    df2_matched = pd.Series(False, index=df2.index)\n",
    "\n",
    "    jaccard_indexes = {}\n",
    "\n",
    "    # unique() preserves first-appearance order, which makes the key order of\n",
    "    # the result deterministic (iterating a set of strings is not).\n",
    "    for article in df1[\"article_id\"].unique():\n",
    "        in_df1 = df1[\"article_id\"] == article\n",
    "        in_df2 = df2[\"article_id\"] == article\n",
    "\n",
    "        # in_df1 can be all-False for a NaN id, because NaN != NaN.\n",
    "        if not (in_df1.any() and in_df2.any()):\n",
    "            continue\n",
    "\n",
    "        df1_rows = df1[in_df1]\n",
    "        df2_rows = df2[in_df2]\n",
    "\n",
    "        jaccard_index_article = {}\n",
    "        for col in common_columns:\n",
    "            df1_list = list(set(df1_rows[col]))\n",
    "            df2_list = list(set(df2_rows[col]))\n",
    "            if pd.api.types.is_numeric_dtype(df1[col]):\n",
    "                score, _, _, _ = jaccard_index_with_floats(df1_list, df2_list, epsilon)\n",
    "            else:\n",
    "                score, _, _, _ = jaccard_index_with_strings(df1_list, df2_list)\n",
    "            jaccard_index_article[col] = score\n",
    "\n",
    "        df1_matched |= in_df1\n",
    "        df2_matched |= in_df2\n",
    "        jaccard_indexes[article] = jaccard_index_article\n",
    "\n",
    "    unmatched_df1 = df1[~df1_matched]\n",
    "    unmatched_df2 = df2[~df2_matched]\n",
    "\n",
    "    return jaccard_indexes, unmatched_df1, unmatched_df2, common_columns\n",
    "\n",
    "\n",
    "# Score assistant_df against nanozymes_df with the default tolerance.\n",
    "jaccard_indexes, unmatched_df1, unmatched_df2, common_columns = calculate_jaccard_index(\n",
    "    df1=assistant_df, df2=nanozymes_df\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-article score dicts, without their article-id keys.\n",
    "jaccard_indexes_list = [*jaccard_indexes.values()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sum each column's coefficients over all articles, then divide by the\n",
    "# number of articles to get the per-column mean.\n",
    "mean_indexes = dict.fromkeys(common_columns, 0)\n",
    "\n",
    "for indexes in jaccard_indexes_list:\n",
    "    for col in common_columns:\n",
    "        mean_indexes[col] += indexes[col]\n",
    "\n",
    "mean_indexes = {\n",
    "    col: total / len(jaccard_indexes_list) for col, total in mean_indexes.items()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'formula': 0.535274060989903,\n",
       " 'activity': 0.6970899470899472,\n",
       " 'syngony': 0.5389440035273368,\n",
       " 'length': 0.44622071050642476,\n",
       " 'width': 0.45354728311077513,\n",
       " 'depth': 0.3908929621231209,\n",
       " 'surface': 0.6679413179413181,\n",
       " 'polymer': 0.6355379188712522,\n",
       " 'surfactants': 0.675573192239859,\n",
       " 'molar_mass': 0.42255605946082136,\n",
       " 'km': 0.5747270674317294,\n",
       " 'v_max': 0.5214783072291425,\n",
       " 'reaction_type': 0.6090828924162255,\n",
       " 'c_min': 0.44175485008818344,\n",
       " 'c_max': 0.2047650541698161,\n",
       " 'c_const': 0.3758818342151676,\n",
       " 'ccat': 0.5052910052910055,\n",
       " 'ph': 0.773809523809524,\n",
       " 'temperature': 0.6014109347442682,\n",
       " 'article_id': 1.0}"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-column mean of the Jaccard coefficients over the scored articles.\n",
    "mean_indexes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>formula</th>\n",
       "      <th>activity</th>\n",
       "      <th>syngony</th>\n",
       "      <th>length</th>\n",
       "      <th>width</th>\n",
       "      <th>depth</th>\n",
       "      <th>surface</th>\n",
       "      <th>polymer</th>\n",
       "      <th>surfactants</th>\n",
       "      <th>molar_mass</th>\n",
       "      <th>km</th>\n",
       "      <th>v_max</th>\n",
       "      <th>reaction_type</th>\n",
       "      <th>c_min</th>\n",
       "      <th>c_max</th>\n",
       "      <th>c_const</th>\n",
       "      <th>ccat</th>\n",
       "      <th>ph</th>\n",
       "      <th>temperature</th>\n",
       "      <th>experiment_title</th>\n",
       "      <th>assistant_answer</th>\n",
       "      <th>article_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Fe3O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00275</td>\n",
       "      <td>0.000014</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.02000</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 1: Fe₃O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CoFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2.2</td>\n",
       "      <td>2.2</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00521</td>\n",
       "      <td>0.000239</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.02000</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 2: CoFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CuFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.3</td>\n",
       "      <td>2.3</td>\n",
       "      <td>2.3</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00118</td>\n",
       "      <td>0.000082</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.02000</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 3: CuFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>MnFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.7</td>\n",
       "      <td>2.7</td>\n",
       "      <td>2.7</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00034</td>\n",
       "      <td>0.000119</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.02000</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 4: MnFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>NiFe2O4-DOPA</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>2.6</td>\n",
       "      <td>2.6</td>\n",
       "      <td>2.6</td>\n",
       "      <td>dopamine-capped</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00034</td>\n",
       "      <td>0.000040</td>\n",
       "      <td>H2O2 + ABTS</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>100.00</td>\n",
       "      <td>0.02000</td>\n",
       "      <td>4.6</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 5: NiFe₂O₄-DOPA Nanozymes</td>\n",
       "      <td>1361-6463.aa5bf6.md</td>\n",
       "      <td>1361-6463.AA5BF6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1415</th>\n",
       "      <td>Pt</td>\n",
       "      <td>laccase</td>\n",
       "      <td>0.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>DNA-stabilized</td>\n",
       "      <td>Oligonucleotide (T10)</td>\n",
       "      <td>0</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-1.00000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>2,4-DCP + 4-AAP</td>\n",
       "      <td>0.05</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.04</td>\n",
       "      <td>-1.00000</td>\n",
       "      <td>7.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>T10-Templated Pt Nanozyme</td>\n",
       "      <td>s10562-017-2106-5.md</td>\n",
       "      <td>S10562-017-2106-5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1428</th>\n",
       "      <td>Ru</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>6.2</td>\n",
       "      <td>6.2</td>\n",
       "      <td>1.8</td>\n",
       "      <td>naked</td>\n",
       "      <td>poly(vinylpyrrolidone)</td>\n",
       "      <td>L-ascorbic acid</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.06030</td>\n",
       "      <td>0.000134</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2000.00</td>\n",
       "      <td>0.00068</td>\n",
       "      <td>4.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>Ruthenium (Ru) Nanoframes: TMB + H2O2</td>\n",
       "      <td>s11434-016-1193-9.md</td>\n",
       "      <td>S11434-016-1193-9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1429</th>\n",
       "      <td>Ru</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>7.0</td>\n",
       "      <td>6.2</td>\n",
       "      <td>6.2</td>\n",
       "      <td>1.8</td>\n",
       "      <td>naked</td>\n",
       "      <td>poly(vinylpyrrolidone)</td>\n",
       "      <td>L-ascorbic acid</td>\n",
       "      <td>0.0</td>\n",
       "      <td>318.00000</td>\n",
       "      <td>0.000074</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.80</td>\n",
       "      <td>0.00068</td>\n",
       "      <td>4.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>Ruthenium (Ru) Nanoframes: H2O2 + TMB</td>\n",
       "      <td>s11434-016-1193-9.md</td>\n",
       "      <td>S11434-016-1193-9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1448</th>\n",
       "      <td>V2O5</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>3.0</td>\n",
       "      <td>500.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.73800</td>\n",
       "      <td>0.018500</td>\n",
       "      <td>TMB + H2O2</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>4.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 1: V2O5 Nanozymes with TMB as Subst...</td>\n",
       "      <td>s16040584.md</td>\n",
       "      <td>S16040584</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1449</th>\n",
       "      <td>V2O5</td>\n",
       "      <td>peroxidase</td>\n",
       "      <td>3.0</td>\n",
       "      <td>500.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>naked</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.23200</td>\n",
       "      <td>0.012900</td>\n",
       "      <td>H2O2 + TMB</td>\n",
       "      <td>0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.00</td>\n",
       "      <td>1.00000</td>\n",
       "      <td>4.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>Experiment 2: V2O5 Nanozymes with H2O2 as Subs...</td>\n",
       "      <td>s16040584.md</td>\n",
       "      <td>S16040584</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>75 rows × 22 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           formula    activity  syngony  length  width  depth  \\\n",
       "2       Fe3O4-DOPA  peroxidase      7.0     3.5    3.5    3.5   \n",
       "3     CoFe2O4-DOPA  peroxidase      7.0     2.2    2.2    2.2   \n",
       "4     CuFe2O4-DOPA  peroxidase      7.0     2.3    2.3    2.3   \n",
       "5     MnFe2O4-DOPA  peroxidase      7.0     2.7    2.7    2.7   \n",
       "6     NiFe2O4-DOPA  peroxidase      7.0     2.6    2.6    2.6   \n",
       "...            ...         ...      ...     ...    ...    ...   \n",
       "1415            Pt     laccase      0.0    -1.0   -1.0   -1.0   \n",
       "1428            Ru  peroxidase      7.0     6.2    6.2    1.8   \n",
       "1429            Ru  peroxidase      7.0     6.2    6.2    1.8   \n",
       "1448          V2O5  peroxidase      3.0   500.0    0.0    0.0   \n",
       "1449          V2O5  peroxidase      3.0   500.0    0.0    0.0   \n",
       "\n",
       "              surface                 polymer      surfactants  molar_mass  \\\n",
       "2     dopamine-capped                       0                0         0.0   \n",
       "3     dopamine-capped                       0                0         0.0   \n",
       "4     dopamine-capped                       0                0         0.0   \n",
       "5     dopamine-capped                       0                0         0.0   \n",
       "6     dopamine-capped                       0                0         0.0   \n",
       "...               ...                     ...              ...         ...   \n",
       "1415   DNA-stabilized   Oligonucleotide (T10)                0        -1.0   \n",
       "1428            naked  poly(vinylpyrrolidone)  L-ascorbic acid         0.0   \n",
       "1429            naked  poly(vinylpyrrolidone)  L-ascorbic acid         0.0   \n",
       "1448            naked                       0                0         0.0   \n",
       "1449            naked                       0                0         0.0   \n",
       "\n",
       "             km     v_max    reaction_type  c_min  c_max  c_const     ccat  \\\n",
       "2       0.00275  0.000014      H2O2 + ABTS   0.00    0.0   100.00  0.02000   \n",
       "3       0.00521  0.000239      H2O2 + ABTS   0.00    0.0   100.00  0.02000   \n",
       "4       0.00118  0.000082      H2O2 + ABTS   0.00    0.0   100.00  0.02000   \n",
       "5       0.00034  0.000119      H2O2 + ABTS   0.00    0.0   100.00  0.02000   \n",
       "6       0.00034  0.000040      H2O2 + ABTS   0.00    0.0   100.00  0.02000   \n",
       "...         ...       ...              ...    ...    ...      ...      ...   \n",
       "1415   -1.00000 -1.000000  2,4-DCP + 4-AAP   0.05    2.0     0.04 -1.00000   \n",
       "1428    0.06030  0.000134       TMB + H2O2   0.00    0.0  2000.00  0.00068   \n",
       "1429  318.00000  0.000074       H2O2 + TMB   0.00    0.0     0.80  0.00068   \n",
       "1448    0.73800  0.018500       TMB + H2O2   0.00    0.0     0.00  1.00000   \n",
       "1449    0.23200  0.012900       H2O2 + TMB   0.00    0.0     0.00  1.00000   \n",
       "\n",
       "       ph  temperature                                   experiment_title  \\\n",
       "2     4.6         25.0                 Experiment 1: Fe₃O₄-DOPA Nanozymes   \n",
       "3     4.6         25.0               Experiment 2: CoFe₂O₄-DOPA Nanozymes   \n",
       "4     4.6         25.0               Experiment 3: CuFe₂O₄-DOPA Nanozymes   \n",
       "5     4.6         25.0               Experiment 4: MnFe₂O₄-DOPA Nanozymes   \n",
       "6     4.6         25.0               Experiment 5: NiFe₂O₄-DOPA Nanozymes   \n",
       "...   ...          ...                                                ...   \n",
       "1415  7.0         25.0                          T10-Templated Pt Nanozyme   \n",
       "1428  4.0         22.0              Ruthenium (Ru) Nanoframes: TMB + H2O2   \n",
       "1429  4.0         22.0              Ruthenium (Ru) Nanoframes: H2O2 + TMB   \n",
       "1448  4.0         25.0  Experiment 1: V2O5 Nanozymes with TMB as Subst...   \n",
       "1449  4.0         25.0  Experiment 2: V2O5 Nanozymes with H2O2 as Subs...   \n",
       "\n",
       "          assistant_answer         article_id  \n",
       "2      1361-6463.aa5bf6.md   1361-6463.AA5BF6  \n",
       "3      1361-6463.aa5bf6.md   1361-6463.AA5BF6  \n",
       "4      1361-6463.aa5bf6.md   1361-6463.AA5BF6  \n",
       "5      1361-6463.aa5bf6.md   1361-6463.AA5BF6  \n",
       "6      1361-6463.aa5bf6.md   1361-6463.AA5BF6  \n",
       "...                    ...                ...  \n",
       "1415  s10562-017-2106-5.md  S10562-017-2106-5  \n",
       "1428  s11434-016-1193-9.md  S11434-016-1193-9  \n",
       "1429  s11434-016-1193-9.md  S11434-016-1193-9  \n",
       "1448          s16040584.md          S16040584  \n",
       "1449          s16040584.md          S16040584  \n",
       "\n",
       "[75 rows x 22 columns]"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unmatched_df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['D0CC04101G(1)',\n",
       " 'J.ACA.2015.04.052',\n",
       " 'S00604-017-2552-1',\n",
       " 'S10562-017-2106-5',\n",
       " 'C2AN35700C(1)',\n",
       " 'J.ACA.2012.11.056',\n",
       " 'J.BIOS.2016.10.082',\n",
       " 'S11434-016-1193-9',\n",
       " 'J.BIOMATERIALS.2010.09.040',\n",
       " 'J.JCIS.2018.12.093',\n",
       " 'ANIE.201105121',\n",
       " 'CHEMOSENSORS1000359',\n",
       " 'C1CC11943E',\n",
       " '1361-6463.AA5BF6',\n",
       " 'J.SNB.2014.12.052',\n",
       " 'J.SNB.2021.130266',\n",
       " '1361-6528.AADDC2',\n",
       " 'S16040584',\n",
       " 'S0039914021005683',\n",
       " 'J.SNB.2017.07.108',\n",
       " 'J.ACA.2016.10.013']"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct article IDs present only in unmatched_df1.\n",
    "# Sorted so the listing is stable across kernel restarts (plain set order is\n",
    "# arbitrary) and can be compared line by line with the unmatched_df2 listing.\n",
    "sorted(set(unmatched_df1[\"article_id\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['J.JCIS.2017.11.064 ',\n",
       " 'C7AY02459B',\n",
       " 'J.BIOS.2016.10.082 ',\n",
       " 'C1CC11943E ',\n",
       " 'S16040584 ',\n",
       " 'ANIE.201105121 ',\n",
       " 'S11434-016-1193-9  ',\n",
       " 'J.ACA.2012.11.056  ',\n",
       " 'C8TB01132J',\n",
       " 'C7TB02434G',\n",
       " 'J.SNB.2021.130266 ',\n",
       " 'S00604-017-2552-1 ',\n",
       " 'J.JCIS.2018.12.093 ',\n",
       " 'AADDC2',\n",
       " 'J.JIEC.2021.09.034',\n",
       " 'S10562-017-2106-5    ',\n",
       " 'J.ACA.2015.04.052 ',\n",
       " 'J.SNB.2014.12.052\\xa0    ',\n",
       " 'CHEMOSENSORS10090359',\n",
       " 'D0CC04101G ',\n",
       " 'C2AN35700C ',\n",
       " 'J.BIOMATERIALS.2010.09.040 ',\n",
       " 'J.SNB.2017.07.108  ',\n",
       " 'J.ACA.2016.10.013 ',\n",
       " 'C1JM14253D',\n",
       " 'AA5BF6']"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Distinct article IDs present only in unmatched_df2.\n",
    "# Sorted so the listing is stable across kernel restarts (plain set order is\n",
    "# arbitrary) and can be compared line by line with the unmatched_df1 listing.\n",
    "# Values are shown raw on purpose: several IDs carry trailing spaces or a\n",
    "# non-breaking space, which looks like the reason they fail to match their\n",
    "# counterparts -- TODO confirm and strip them where article_id is built.\n",
    "sorted(set(unmatched_df2[\"article_id\"]))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
