import pandas as pd
import numpy as np
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina

def morgan_fp(mol):
   """Return the Morgan fingerprint of ``mol``.

   A Morgan generator is created with ``radius=2`` and ``fpSize=512`` on
   every call, and the result of its ``GetFingerprint(mol)`` is returned.

   Args:
      mol: Molecule object handed to the generator (presumably an RDKit
         ``Mol`` -- confirm against callers).

   Returns:
      Whatever ``GetFingerprint`` produces for ``mol``.
   """
   return AllChem.GetMorganGenerator(radius=2, fpSize=512).GetFingerprint(mol)

def butina(fingerprints, dist_thresh=0.5):
   """Cluster fingerprints with the Butina algorithm on Tanimoto distances.

   The pairwise distances ``1 - TanimotoSimilarity`` are laid out as a flat
   lower-triangular list (row ``i`` holds the distances to items
   ``0 .. i-1``) and handed to ``Butina.ClusterData`` with
   ``isDistData=True``.

   Args:
      fingerprints: Indexable collection of fingerprints supporting
         ``len()`` and ``fingerprints[i]`` for ``i`` in
         ``range(len(fingerprints))``.
      dist_thresh: Distance threshold forwarded to ``Butina.ClusterData``
         as ``distThresh``. Defaults to 0.5.

   Returns:
      A list of clusters as returned by ``Butina.ClusterData`` (each a
      collection of item indices), sorted by size, largest first.
   """
   n_points = len(fingerprints)
   # Materialise through the same ``fingerprints[i]`` lookup callers relied
   # on, so list and pandas inputs resolve exactly as before, while giving
   # the bulk similarity call below a plain list to slice.
   fps = [fingerprints[i] for i in range(n_points)]

   distances = []
   for i, fp in enumerate(fps):
      print(f"{i} / {n_points}", end='\r')
      if i == 0:
         # The first item has no predecessors, hence no distances.
         continue
      # One bulk call per row instead of one Python-level call per pair;
      # the similarities come back in the order of fps[:i].
      similarities = DataStructs.BulkTanimotoSimilarity(fp, fps[:i])
      distances.extend(1 - similarity for similarity in similarities)

   clusters = Butina.ClusterData(data=distances, nPts=n_points, distThresh=dist_thresh, isDistData=True)
   return sorted(clusters, key=len, reverse=True)

def cluster(smiles):
   """Group SMILES entries into clusters based on their Morgan fingerprints.

   The entries are loaded into a DataFrame, a molecule column is added via
   ``PandasTools.AddMoleculeColumnToFrame``, each ``ROMol`` entry is passed
   through ``morgan_fp``, and the fingerprints are clustered with
   ``butina``.

   Args:
      smiles: Collection of SMILES strings; must be indexable by the
         indices that ``butina`` returns.

   Returns:
      A list of clusters in the order produced by ``butina``; each cluster
      is a list of the corresponding entries taken from ``smiles``.
   """
   frame = pd.DataFrame(data={"smiles": smiles})
   PandasTools.AddMoleculeColumnToFrame(frame=frame, smilesCol='smiles')
   frame['morgan'] = frame['ROMol'].apply(morgan_fp)

   grouped_indices = butina(frame['morgan'])

   # Translate every cluster of indices back into the SMILES they refer to.
   return [
      [smiles[member] for member in members]
      for members in grouped_indices
   ]