import sys
import os
import pandas as pd
from rdkit.Chem import Draw
from rdkit import Chem
from rdkit.Chem import RDConfig
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.Descriptors import ExactMolWt as MolWt

sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))

import sascorer
import numpy as np
import pubchempy as pcp

# Directory that receives the rendered molecule images; it is created at
# start-up when it is not already present.
output_folder = "molecules_images"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder, exist_ok=True)

def save_smiles_image(smiles, filename, image_size=(400,400), dpi=300):
    """Render a SMILES string to an image and save it under ``filename``.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.
        filename: Destination path passed to the image's ``save`` method.
        image_size: Forwarded as the ``size`` argument of ``Draw.MolToImage``.
        dpi: Resolution value used for both entries of the ``dpi`` tuple
            given to ``save``.

    Nothing is written when the parsed molecule is falsy.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if not molecule:
        return
    rendered = Draw.MolToImage(molecule, size=image_size)
    rendered.save(filename, dpi=(dpi, dpi))

fname_list = ["{model}_base_generated_smiles_data_{variant}.csv", "cbm_{variant}.csv"]
variants = ["ood_qed_high_logp_high", "ood_qed_high_logp_low", "ood_qed_low_logp_high", "ood_qed_low_logp_low"]
model_list = ["chemlactica-125m", "chemlactica-1.3b", "chemma-2b"]
datasets = ["zinc", "moses", "chembl"]

for model_name in model_list:
    for variant in variants:
        for fstr in fname_list:
            fname = fstr.format(variant=variant, model=model_name)
            try:
                df = pd.read_csv(f"ood_multiple_properties/{model_name}/{fname}")
            except:
                continue
            print(model_name + ": " + fname)
            scaffolds = set()
            counter = 0
            mol_wt_list = []
            length_list = []
            success_request = 0
            score_list = []
            already_exist = 0
            for index, row in df[df['valid_smiles']==True].iterrows():
                if (counter+1) %1000 == 0:
                    print(f"{counter} molecules done")
                smiles_string = row['smiles_string']
                mol = Chem.MolFromSmiles(smiles_string)
                mol_wt_list.append(MolWt(mol))
                length_list.append(len(smiles_string))
                if mol:
                    scaffold = MurckoScaffold.GetScaffoldForMol(mol)
                    scaffold_smiles = Chem.MolToSmiles(scaffold, isomericSmiles=False)
                    scaffolds.add(scaffold_smiles)
                if counter < 5:
                    image_filename = os.path.join(output_folder, f"molecule_{index}.pdf")
                    # Save the SMILES image
                    save_smiles_image(smiles_string, image_filename)
                counter += 1
                try:
                    mol = Chem.MolFromSmiles(smiles_string)
                    score_list.append(sascorer.calculateScore(mol))
                    exist_out = pcp.get_compounds(smiles_string, 'smiles')
                    if bool(exist_out[0].record['id']):
                        already_exist += 1
                    success_request += 1
                except:
                    continue

            with open("length_and_wt_results.txt", "a") as f:
                f.write(model_name + ": " + fname + "\n")
                f.write(f"Molecular Weight - Min: {round(np.min(mol_wt_list),3)}     Max: {round(np.max(mol_wt_list),3)}      Avg: {round(np.mean(mol_wt_list),3)}\n")
                f.write(f"Length - Min: {round(np.min(length_list),3)}     Max: {round(np.max(length_list),3)}      Avg: {round(np.mean(length_list),3)}\n")
                f.write(f"Unique Scaffolds: {len(scaffolds)}\n")
                f.write(f"Molecules in CSV: {len(df)}\n")
                f.write(f"Valid SMILES: {counter}\n")
                f.write(f"Successful requests to PubChem: {success_request}\n")
                f.write(f"Number of molecules existing in PubChem: {already_exist} (~{(already_exist/success_request)*100}%)\n")

                f.write(f"Average synthesizability score: {round(np.mean(score_list), 7)}\n")
                f.write(f"Min synth score: {np.min(score_list)} and max synth score: {np.max(score_list)}\n")
                f.write("-------------------------------------------------\n")
