#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os
from rdkit import Chem
from tqdm import tqdm


# In[2]:


# Read the SMILES vocabulary file and collect the characters it uses
def build_smiles_dict(smiles_file):
    """Return the set of distinct characters found in *smiles_file*.

    Every line is stripped of leading/trailing whitespace and each remaining
    character is added to the result. Despite the name, the returned object
    is a ``set`` of single-character strings, not a dict and not a set of
    whole SMILES strings.
    """
    with open(smiles_file, 'r') as handle:
        return {char for row in handle for char in row.strip()}

# Standardize a SMILES string with RDKit
def standardize_smiles(smiles):
    """Return a canonical, non-isomeric SMILES for *smiles*, or ``None``.

    The input is parsed with ``Chem.MolFromSmiles``, passed through
    ``Chem.RemoveHs`` and written back with ``Chem.MolToSmiles`` using
    ``canonical=True`` and ``isomericSmiles=False`` (stereochemistry is not
    written to the output). No salt stripping or other normalization is
    performed here.

    Returns:
        The re-generated SMILES string, or ``None`` when the input cannot be
        parsed or RDKit raises while processing it.
    """
    try:
        # Convert the SMILES string into a molecule object
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        mol = Chem.RemoveHs(mol)  # remove hydrogens
        return Chem.MolToSmiles(mol, isomericSmiles=False, canonical=True)
    except Exception:
        # Best-effort: any RDKit failure means "not usable". Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long-running scan can still be interrupted.
        return None


# In[3]:


# Filter the SMILES strings that meet the requirements
def filter_smiles(zinc_folder, smiles_dict, output_file, min_len=34, max_len=74):
    """Scan ``.txt`` files under *zinc_folder* and write the accepted SMILES.

    For every non-blank line of every ``.txt`` file found (recursively), the
    first whitespace-separated token is passed to ``standardize_smiles``.
    The result is written to *output_file*, one per line, when all of the
    following hold:

    * standardization returned a non-empty string,
    * ``min_len <= len(result) <= max_len``,
    * every character of the result is contained in *smiles_dict*.

    Args:
        zinc_folder: Root directory to walk.
        smiles_dict: Container of allowed characters (membership-tested per
            character).
        output_file: Path of the file to (over)write with accepted SMILES.
        min_len: Minimum accepted length of the standardized SMILES.
        max_len: Maximum accepted length of the standardized SMILES.

    The path of each processed file and the final count of written SMILES
    are printed to stdout.
    """
    num = 0
    with open(output_file, 'w') as out_f:
        # os.walk recursively traverses zinc_folder and all its sub-folders,
        # yielding (root, dirs, files) triples:
        #   root:  path of the directory currently being visited
        #   dirs:  names of the sub-directories in that directory
        #   files: names of the files in that directory
        for root, _, files in os.walk(zinc_folder):
            for file in tqdm(files):
                if not file.endswith('.txt'):
                    continue
                file_path = os.path.join(root, file)
                print("file_path:", file_path)
                with open(file_path, 'r') as in_f:
                    for line in tqdm(in_f):
                        fields = line.split()
                        if not fields:
                            # Blank/whitespace-only line: indexing [0] would
                            # raise IndexError and abort the whole run.
                            continue
                        # Standardize the SMILES string
                        standardized_smiles = standardize_smiles(fields[0])
                        if not standardized_smiles:
                            continue
                        if not min_len <= len(standardized_smiles) <= max_len:
                            continue
                        if all(char in smiles_dict for char in standardized_smiles):
                            num += 1
                            out_f.write(standardized_smiles + '\n')
        print("pretrained data num:", num)


# In[4]:


# Input and output locations (relative to the working directory)
output_file = 'pretrainedsmiles.txt'
zinc_folder = 'ZINC15'
smiles_file = 'smiles.txt'

# Build the allowed-character set, then filter the ZINC15 files against it
smiles_dict = build_smiles_dict(smiles_file)
filter_smiles(
    zinc_folder=zinc_folder,
    smiles_dict=smiles_dict,
    output_file=output_file,
)


# In[ ]:




