import imp

import argparse
from rdkit import Chem
# suppress rdkit warning
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

import pandas as pd



# Element symbols accepted by the filter below: a row is flagged for removal
# as soon as one of its atoms has a symbol that is not listed here.
ATOM_LIST = [
    'C', 'H', 'O', 'N', 'F', 'S',
    'Cl', 'P', 'B', 'Br', 'I',
]

def count_atom_records(mol_block):
    """Count the lines of a mol block that look like atom records.

    A line is counted when it is exactly 69 characters long and splits into
    16 whitespace-separated fields.

    Args:
        mol_block: Text of a mol block, lines separated by ``"\\n"``.

    Returns:
        The number of matching lines.
    """
    return sum(
        1
        for line in mol_block.split("\n")
        if len(line) == 69 and len(line.split()) == 16
    )


def is_valid_molecule(smiles, max_atoms=300):
    """Tell whether a SMILES string passes every filter of this script.

    A molecule is rejected when any of the following holds:

    * ``smiles`` is not a string (e.g. an empty CSV cell read as NaN),
    * RDKit cannot parse it,
    * its mol block has fewer atom records than the molecule has atoms,
    * it has more than ``max_atoms`` atoms (counted after adding hydrogens),
    * it contains an element that is not in ``ATOM_LIST``.

    Args:
        smiles: Value taken from the ``SMILES`` column.
        max_atoms: Largest accepted atom count, explicit hydrogens included.

    Returns:
        True when the molecule should be kept, False when it must be removed.
    """
    # A missing cell is not a string; reject it before RDKit raises on it.
    if not isinstance(smiles, str):
        return False

    mol = Chem.MolFromSmiles(smiles)
    # The None check must come before AddHs: MolFromSmiles returns None for
    # unparsable input and AddHs raises when it is handed None.
    if mol is None:
        return False
    mol = Chem.AddHs(mol)

    num_atoms = mol.GetNumAtoms()
    if count_atom_records(Chem.MolToMolBlock(mol)) < num_atoms:
        return False
    # NOTE(review): an earlier comment here read "--num_atoms 120"; how that
    # option relates to this limit is not visible in this file.
    if num_atoms > max_atoms:
        return False

    return all(atom.GetSymbol() in ATOM_LIST for atom in mol.GetAtoms())


def main():
    """Read the input CSV, drop the rejected molecules and write the result.

    The input must provide a ``SMILES`` column. The output keeps the
    ``Remove`` helper column (False for every remaining row) and the index.
    """
    parser = argparse.ArgumentParser(description='Preprocess the Data')
    parser.add_argument('--input', type=str, default='',
                        help='path to input data')
    parser.add_argument('--output', type=str, default='',
                        help='path to output data')
    args = parser.parse_args()

    df = pd.read_csv(args.input)
    print("Load {} data from {}".format(len(df), args.input))

    # One pass builds the whole mask; an explicit bool dtype keeps the
    # inversion below valid even when the input has no rows.
    keep = pd.Series(
        [is_valid_molecule(smiles) for smiles in df['SMILES']],
        index=df.index,
        dtype=bool,
    )
    df['Remove'] = ~keep
    df = df[keep]
    print("After filter, we get {} data".format(len(df)))

    df.to_csv(args.output)
    print("Save it to {}".format(args.output))


if __name__ == '__main__':
    main()

