from collections import defaultdict
import csv
import math
from rdkit import Chem
from similarity_clustering import cluster

# Read the target identifiers to look for, one per line.
# Blank lines are skipped: an empty-string target would compare equal to any
# empty field of a data row and therefore "match" almost every row.
with open("./targets.txt", 'r') as file:
    targets = [line.strip() for line in file if line.strip()]
        
# Parse all ligands from BindingDB whose row mentions one of the targets.
# Result: ligands[target][canonical SMILES] -> value derived from the row's
# affinity measurement (later rows overwrite earlier ones for the same SMILES).
ligands = defaultdict(lambda: defaultdict(float))
with open("./BindingDB_All.tsv", 'r') as file:
    reader = csv.reader(file, delimiter='\t')
    next(reader, None)  # skip the header row; tolerate an empty file
    for idx, row in enumerate(reader):
        # Report progress once per row, independent of how many targets
        # there are and of whether the row is later skipped.
        if idx % 10000 == 0:
            print(f"{idx} / 3,000,000", end='\r')

        # A target matches when it equals one of the row's fields exactly.
        matched = [target for target in targets if target in row]
        if not matched or len(row) < 12:
            # Nothing to record, or the row is too short to hold the
            # SMILES (column 1) and affinity (columns 8-11) fields.
            continue

        # Columns 8..11 hold the affinity measurements; when several are
        # filled in, the highest-indexed one wins.
        # NOTE(review): presumably Ki / IC50 / Kd / EC50 in nM -- confirm
        # against the TSV header.
        raw = next(
            (row[col] for col in (11, 10, 9, 8) if row[col].strip()),
            None,
        )
        if raw is None:
            continue

        try:
            # Censored values such as "<0.1" or ">10000" are used at
            # their bound.
            value = float(raw.replace("<", "").replace(">", ""))
            # NOTE(review): looks like RT*ln(K) with T = 298 K,
            # R = 0.001987 kcal/(mol*K) and a nM -> M conversion -- confirm.
            affin = 298*0.001987*math.log(value*math.pow(10, -9))
        except ValueError:
            # Non-numeric cell, or a non-positive value that has no log.
            continue

        # Canonicalise the SMILES once per row so that the same molecule
        # written differently maps to a single key.
        try:
            mol = Chem.MolFromSmiles(row[1])
            smiles = Chem.MolToSmiles(mol) if mol is not None else None
        except Exception:
            # Best effort: an unparsable ligand only drops this row.
            continue
        if smiles is None:
            continue

        for target in matched:
            ligands[target][smiles] = affin
            
# Group each protein's ligands into similarity clusters.
# clusters[protein] holds whatever cluster() returns for that protein's
# list of SMILES keys.
clusters = defaultdict(list)
proteins = list(ligands.keys())
last_index = len(proteins) - 1
for position, name in enumerate(proteins):
    # Progress line: "<protein> <current index> / <last index>"
    print(f"{name} {position} / {last_index}")
    clusters[name] = cluster(list(ligands[name].keys()))

# For every target, pick one representative per cluster -- the ligand with
# the lowest stored affinity value -- and write the representatives to
# ./samples/<target>.txt, one SMILES per line, lowest value first.
# NOTE: open(..., 'w') does not create directories; ./samples must exist.
for target in clusters.keys():
    affinities = ligands[target]
    sample = []
    for members in clusters[target]:
        # Take the true minimum rather than comparing against a fixed 0
        # sentinel: with the sentinel, a cluster without any negative value
        # (or an empty cluster) contributed an empty string, which produced
        # a blank output line and broke the sort below.
        best = min(members, key=lambda smiles: affinities[smiles], default=None)
        if best is not None:
            sample.append(best)
    sample.sort(key=affinities.get)
    with open(f"./samples/{target}.txt", 'w') as file:
        file.writelines(ligand + "\n" for ligand in sample)
    