import pandas as pd
import numpy as np
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

# Fingerprint generators keyed by (radius, fp_size); building one per molecule
# is wasted work when morgan_fp is applied over a whole column.
_MORGAN_GENERATORS = {}


def morgan_fp(mol, radius=2, fp_size=512):
    """Return the Morgan fingerprint of *mol* as an RDKit bit vector.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Morgan radius passed to the generator (default 2).
        fp_size: Number of bits in the fingerprint (default 512).

    Returns:
        The result of ``GetFingerprint(mol)`` on a Morgan generator
        configured with ``radius`` and ``fp_size``.
    """
    key = (radius, fp_size)
    generator = _MORGAN_GENERATORS.get(key)
    if generator is None:
        generator = AllChem.GetMorganGenerator(radius=radius, fpSize=fp_size)
        _MORGAN_GENERATORS[key] = generator
    return generator.GetFingerprint(mol)

def butina(fingerprints, dist_thresh=0.5):
    """Cluster fingerprints with the Butina algorithm on Tanimoto distance.

    Args:
        fingerprints: Iterable of RDKit fingerprints. It is materialised
            into a list, so any iterable (including a pandas Series with
            a non-default index) is accepted.
        dist_thresh: Distance cutoff (1 - Tanimoto similarity) forwarded
            to ``Butina.ClusterData`` (default 0.5).

    Returns:
        A list of clusters as returned by ``Butina.ClusterData`` (each a
        collection of positional indices into ``fingerprints``), sorted
        from the largest cluster to the smallest.
    """
    # Positional access below must not depend on the container's own
    # indexing rules (a pandas Series indexes by label, not position).
    fps = list(fingerprints)

    # Flattened lower triangle of the distance matrix, row by row:
    # (1,0), (2,0), (2,1), ... -- the layout ClusterData is given with
    # isDistData=True.
    distances = []
    for i in range(1, len(fps)):
        similarities = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        distances.extend(1 - similarity for similarity in similarities)

    clusters = Butina.ClusterData(
        data=distances,
        nPts=len(fps),
        distThresh=dist_thresh,
        isDistData=True,
    )
    return sorted(clusters, key=len, reverse=True)

def cluster(smiles):
    """Pick one representative SMILES per Butina cluster.

    Args:
        smiles: A SMILES string or an iterable of SMILES strings. The
            iterable is materialised into a list so that the positional
            indices produced by clustering can be mapped back to the
            input regardless of the container type.

    Returns:
        A list with one SMILES per cluster, ordered from the largest
        cluster to the smallest. The representative of a cluster is its
        member with the smallest input position.
    """
    # A bare string would otherwise be split into characters by list().
    if isinstance(smiles, str):
        smiles = [smiles]
    else:
        smiles = list(smiles)

    df = pd.DataFrame(data={"smiles": smiles})
    PandasTools.AddMoleculeColumnToFrame(frame=df, smilesCol='smiles')
    # NOTE(review): presumably 'ROMol' holds None for SMILES that fail to
    # parse, which would make morgan_fp raise -- confirm and decide whether
    # invalid entries should be rejected up front.
    df['morgan'] = df['ROMol'].apply(morgan_fp)

    # Hand over a plain list so cluster indices are positional.
    index_clusters = butina(df['morgan'].tolist())

    return [smiles[min(members)] for members in index_clusters]
   