{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "o34OPWm-yDjN",
        "outputId": "7ba0e1a4-a37d-4d9a-d538-4b9eb6986889"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting rdkit-pypi\n",
            "  Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from rdkit-pypi) (1.26.4)\n",
            "Requirement already satisfied: Pillow in /usr/local/lib/python3.10/dist-packages (from rdkit-pypi) (11.0.0)\n",
            "Downloading rdkit_pypi-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m29.4/29.4 MB\u001b[0m \u001b[31m61.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: rdkit-pypi\n",
            "Successfully installed rdkit-pypi-2022.9.5\n"
          ]
        }
      ],
      "source": [
        "!pip install rdkit-pypi"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "from rdkit import Chem\n",
        "from rdkit.Chem import BRICS\n",
        "import re\n",
        "import time\n",
        "from rdkit import RDLogger\n",
        "\n",
        "# Suppress RDKit warnings\n",
        "RDLogger.DisableLog('rdApp.*')\n",
        "\n",
        "def clean_fragment_smiles(smiles):\n",
        "    \"\"\"Clean up BRICS fragment SMILES by removing parts like '[14*]' and any dangling parentheses.\"\"\"\n",
        "    cleaned_smiles = re.sub(r'\\[\\d+\\*\\]', '', smiles)\n",
        "    cleaned_smiles = re.sub(r'\\(\\)', '', cleaned_smiles)\n",
        "    return cleaned_smiles\n",
        "\n",
        "def get_atom_features(atom):\n",
        "    \"\"\"Extract features from a single atom and return them as a dictionary.\"\"\"\n",
        "    return {\n",
        "        'AtomicNum': atom.GetAtomicNum(),\n",
        "        'Hybridization': int(atom.GetHybridization()),\n",
        "        'Valence': atom.GetTotalValence(),\n",
        "        'FormalCharge': atom.GetFormalCharge(),\n",
        "        'Degree': atom.GetDegree(),\n",
        "        'IsInRing': int(atom.IsInRing())\n",
        "    }\n",
        "\n",
        "def find_ring_fragments_and_neighbors(original_mol, brics_fragments):\n",
        "    \"\"\"Find ring fragments and their neighboring atoms, including additional chemical features.\"\"\"\n",
        "    ring_fragment_neighbors = []\n",
        "    for frag in brics_fragments:\n",
        "        if frag.GetRingInfo().NumRings() > 0:\n",
        "            query_mol = Chem.DeleteSubstructs(Chem.RWMol(frag), Chem.MolFromSmarts('[#0]'))\n",
        "            try:\n",
        "                Chem.SanitizeMol(query_mol)\n",
        "            except:\n",
        "                continue\n",
        "            matches = original_mol.GetSubstructMatches(query_mol, uniquify=False)\n",
        "\n",
        "            for match in matches:\n",
        "                match_set = set(match)\n",
        "                for idx in match:\n",
        "                    atom = original_mol.GetAtomWithIdx(idx)\n",
        "                    for neighbor in atom.GetNeighbors():\n",
        "                        n_idx = neighbor.GetIdx()\n",
        "                        if n_idx not in match_set:\n",
        "                            frag_smiles = Chem.MolToSmiles(frag, isomericSmiles=True)\n",
        "                            frag_smiles = clean_fragment_smiles(frag_smiles)\n",
        "                            neighbor_atom = original_mol.GetAtomWithIdx(n_idx)\n",
        "                            neighbor_features = get_atom_features(neighbor_atom)\n",
        "                            ring_fragment_neighbors.append({\n",
        "                                'Ring Fragment': frag_smiles,\n",
        "                                'Neighbor Atom': neighbor_atom.GetSymbol(),\n",
        "                                **neighbor_features\n",
        "                            })\n",
        "    return ring_fragment_neighbors\n",
        "\n",
        "# Example usage\n",
        "df = pd.read_csv('0SelectedSMILES_QM9.txt')\n",
        "results = []\n",
        "start_time = time.time()\n",
        "for index, row in df.iterrows():\n",
        "    mol = Chem.MolFromSmiles(row['smiles'])\n",
        "    brics_fragments = list(BRICS.BRICSDecompose(mol, returnMols=True))\n",
        "    ring_neighbors_info = find_ring_fragments_and_neighbors(mol, brics_fragments)\n",
        "    results.extend(ring_neighbors_info)\n",
        "end_time = time.time()\n",
        "print(f'Time to extract substructures from QM9 is {end_time - start_time}')\n",
        "\n",
        "results_df = pd.DataFrame(results)\n",
        "results_df = results_df.drop_duplicates()\n",
        "results_df.to_csv('ring_fragments_neighbors_qm9_enriched.txt', index=False)\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1WzOAgDu1KPz",
        "outputId": "163fd3f9-d0ed-4dd4-a5ac-9c65d07d2786"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Time to extract substructures from QM9 is 191.86945819854736\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "q5g5SN6FyoDR"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}