#!/usr/bin/env python
# coding: utf-8

# In[4]:


from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from tqdm import tqdm


# In[5]:


# Path of the text file to read; the script below consumes it line by line and
# treats each non-empty line as one SMILES string.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_file = "C:/Users/LHR/0code/NC2025/data/pretrainedsmiles.txt"
# Path of the tab-separated results file; it is relative, so it is created in
# the current working directory and overwritten on every run.
output_file = "pretrainedsmiles_properties.txt"


# In[6]:


# SMARTS pattern for an -NH2 group: a nitrogen with three total connections
# of which exactly two are hydrogens.
# NOTE(review): this also matches the -NH2 of primary amides/sulfonamides,
# not only amines — confirm that is the intended definition of "amino".
amino_smarts = "[NX3;H2]"
amino_pattern = Chem.MolFromSmarts(amino_smarts)

# Compute four properties per input SMILES and write one tab-separated row:
#   molecular weight (2 decimals), ring count, largest ring size, -NH2 count.
# Blank input lines are skipped and produce no output row; SMILES that RDKit
# cannot parse produce a row of four -1 placeholders.
#
# Encodings are given explicitly so the result does not depend on the
# platform's locale default; "utf-8-sig" additionally drops a leading BOM
# that would otherwise be glued onto the first SMILES string.
with open(input_file, "r", encoding="utf-8-sig") as fin, \
        open(output_file, "w", encoding="utf-8") as fout:
    for line in tqdm(fin):
        smi = line.strip()
        if not smi:
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            # Unparseable SMILES: emit a placeholder row instead of dropping it.
            fout.write("-1\t-1\t-1\t-1\n")
            continue

        # 1. Molecular weight
        mol_weight = Descriptors.MolWt(mol)

        # 2. Number of rings
        ring_info = mol.GetRingInfo()
        num_rings = ring_info.NumRings()

        # 3. Size (atom count) of the largest ring; 0 for acyclic molecules
        max_ring_size = max(
            (len(ring) for ring in ring_info.AtomRings()), default=0
        )

        # 4. Number of -NH2 groups (one match per matching nitrogen atom)
        num_amino = len(mol.GetSubstructMatches(amino_pattern))

        # Write the result row
        fout.write(f"{mol_weight:.2f}\t{num_rings}\t{max_ring_size}\t{num_amino}\n")


# In[ ]:




