import os
import torch

import numpy as np
import pandas as pd
from rdkit import Chem
import json
from sklearn.utils import shuffle
from rdkit.Chem import FragmentCatalog
from rdkit.Chem import RDConfig
from datasets import Dataset
from torch.utils.data import DataLoader


def load_functional_groups_from_csv(csv_file):
    """Read a header-less two-column CSV of functional group definitions.

    The first column is read as the SMARTS pattern and the second as the
    group name.

    Returns:
        dict mapping each SMARTS pattern to its group name. When the same
        SMARTS appears more than once, the last row wins.
    """
    table = pd.read_csv(csv_file, header=None, names=['SMARTS', 'GroupName'])
    return dict(zip(table['SMARTS'], table['GroupName']))


def find_functional_groups(smiles: str, functional_groups: dict):
    """Count functional group occurrences in a molecule.

    Args:
        smiles: SMILES string of the molecule to inspect.
        functional_groups: mapping of SMARTS pattern -> group name.

    Returns:
        dict mapping group name -> number of substructure matches, containing
        only the groups that matched at least once. If several SMARTS share a
        group name, the count of the last matching pattern is kept.

    Raises:
        ValueError: if ``smiles`` or any SMARTS pattern cannot be parsed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    detected_groups = {}

    for group_smarts, group_name in functional_groups.items():
        patt = Chem.MolFromSmarts(group_smarts)
        if patt is None:
            # Fail with a readable message instead of the opaque argument
            # error RDKit raises when a None query reaches the matcher.
            raise ValueError(
                f"Invalid SMARTS pattern for functional group "
                f"{group_name!r}: {group_smarts}"
            )
        # One matching pass is enough: an empty result means "no match".
        matches = mol.GetSubstructMatches(patt)
        if matches:
            detected_groups[group_name] = len(matches)

    return detected_groups



def _canonical_smiles(raw_smiles, column, position):
    """Return the RDKit canonical SMILES for one cell, or raise a clear error."""
    mol = Chem.MolFromSmiles(raw_smiles)
    if mol is None:
        raise ValueError(
            f"Invalid SMILES in column {column!r} at row position {position}: {raw_smiles}"
        )
    return Chem.MolToSmiles(mol)


def extract_latents_of_percond_inputs(raw_df, save_json_path,
                                      func_group_csv='functional_groups_smiles_codes.csv'):
    """Build one instruction/output prompt record per reaction row.

    For every row the reactants, catalyst, solvent and product SMILES are
    canonicalised, their functional groups are listed, the functional group
    changes between reactants and product are summarised, and everything is
    assembled into a single instruction string ending with a selectivity
    question. The records are also written to ``save_json_path`` as JSON.

    Args:
        raw_df: DataFrame with the columns ``Reactant1``, ``Reactant2``,
            ``Catalyst``, ``Solvent``, ``Product`` and ``objective``.
        save_json_path: path of the JSON file that receives the records.
        func_group_csv: CSV file with the functional group definitions
            (SMARTS, group name). Defaults to the previously hard-coded path.

    Returns:
        (results, results_raw_prompt): ``results`` is a list of dicts with the
        keys ``instruction``, ``input``, ``output`` and ``history``;
        ``results_raw_prompt`` is a list of dicts holding the individual
        prompt pieces for each row.

    Raises:
        ValueError: if a SMILES cell cannot be parsed by RDKit.
    """
    func_groups = load_functional_groups_from_csv(func_group_csv)

    results = []
    results_raw_prompt = []
    for i in range(len(raw_df)):
        row = raw_df.iloc[i]
        objective = row['objective']

        reactant_1 = _canonical_smiles(row['Reactant1'], 'Reactant1', i)
        reactant_2 = _canonical_smiles(row['Reactant2'], 'Reactant2', i)
        catalyst = _canonical_smiles(row['Catalyst'], 'Catalyst', i)
        solvent = _canonical_smiles(row['Solvent'], 'Solvent', i)
        product = _canonical_smiles(row['Product'], 'Product', i)

        # The prompt is assembled from a reaction sentence, a fixed reaction
        # type sentence, a conditions sentence (solvent only) and the
        # functional group descriptions collected below.
        prompt_dict = {}
        prompt_dict['reaction'] = f'Here is a chemical reaction. Reactants are: {reactant_1}, {reactant_2}. Product is: {product}.'
        prompt_dict['reaction_type'] = 'Reaction type is asymmetric nucleophilic addition.'
        prompt_dict['condition'] = f'The reaction conditions of this reaction are: Solvent: {solvent}. '
        prompt_dict['graph_knowledge'] = []

        # Describe the functional groups found in every species, tagged with
        # the role that species plays in the reaction.
        smiles = [reactant_1, reactant_2, product, catalyst, solvent]
        roles = ['Reactant', 'Reactant', "Product", "Catalyst", 'Solvent']
        for smile, role in zip(smiles, roles):
            prompt_dict['graph_knowledge'].extend(get_func_group(func_groups, smile, role))

        prompt_dict['graph_knowledge_changes'] = obtain_functional_group_changes(prompt_dict['graph_knowledge'])
        results_raw_prompt.append(prompt_dict)

        fg_graph = '. '.join(prompt_dict['graph_knowledge'])
        fg_new = '. '.join(prompt_dict['graph_knowledge_changes']['New Functional Groups'])
        fg_lost = '. '.join(prompt_dict['graph_knowledge_changes']['Lost Functional Groups'])
        fg_prompt = f"{fg_graph}. New Functional Groups: {fg_new}. Lost Functional Groups: {fg_lost}"

        instruct = (f"{prompt_dict['reaction']} {prompt_dict['reaction_type']} {prompt_dict['condition']}"
                    f" Functional groups information: {fg_prompt}.")
        instruct = instruct + " What is the selectivity of this reaction?"

        results.append({'instruction': instruct, 'input': '', 'output': objective, 'history': []})

    # Use a context manager so the file is flushed and closed deterministically.
    with open(save_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(results, json_file)
    return results, results_raw_prompt




def obtain_functional_group_changes(graph_knowledge):
    """Summarise which functional groups appear or disappear in a reaction.

    Each entry of ``graph_knowledge`` is expected to look like
    ``"<Role>: [<smiles>]. Functional Group: <name>(<smarts>). Count: <n>"``.
    Counts are summed per functional group over all entries mentioning
    "Reactant" and, separately, over all entries mentioning "Product"; the two
    totals are then compared.

    Args:
        graph_knowledge: list of functional group description strings.

    Returns:
        dict with the keys ``"New Functional Groups"`` and
        ``"Lost Functional Groups"``. Each value is a list of description
        strings, or ``["None"]`` when there is nothing to report.
    """
    fg_marker = "Functional Group: "
    count_marker = "Count: "

    def extract_functional_group_details(role):
        """Sum the counts per functional group over the entries of one role."""
        details = {}
        for entry in graph_knowledge:
            if role not in entry:
                continue
            _, found_fg, tail = entry.partition(fg_marker)
            name_part, found_count, count_part = tail.rpartition(count_marker)
            if not found_fg or not found_count:
                continue
            # The group label is everything between the two markers. It must
            # not be cut at a comma (SMARTS may contain commas) and must not
            # keep the ". Count: n" suffix, otherwise the same group with
            # different counts would be treated as different groups.
            fg_type = name_part.rstrip(" .,")
            details[fg_type] = details.get(fg_type, 0) + int(count_part)
        return details

    reactant_fg_details = extract_functional_group_details("Reactant")
    product_fg_details = extract_functional_group_details("Product")

    # Groups that are more frequent in (or exclusive to) the product.
    new_functional_groups = []
    for fg, count in product_fg_details.items():
        if fg in reactant_fg_details:
            if count > reactant_fg_details[fg]:
                new_functional_groups.append(f"Functional Group: {fg}. Increased by: {count - reactant_fg_details[fg]}")
        elif count > 0:
            new_functional_groups.append(f"Functional Group: {fg}. New: {count}")

    # Groups that are more frequent in (or exclusive to) the reactants.
    lost_functional_groups = []
    for fg, count in reactant_fg_details.items():
        if fg in product_fg_details:
            if count > product_fg_details[fg]:
                lost_functional_groups.append(f"Functional Group: {fg}. Decreased by: {count - product_fg_details[fg]}")
        elif count > 0:
            lost_functional_groups.append(f"Functional Group: {fg}. Lost: {count}")

    return {
        "New Functional Groups": new_functional_groups if new_functional_groups else ["None"],
        "Lost Functional Groups": lost_functional_groups if lost_functional_groups else ["None"],
    }



def get_func_group(func_groups_df, smiles, Role):
    """Describe the functional groups detected in one molecule.

    Args:
        func_groups_df: mapping of SMARTS pattern -> group name (a dict,
            despite the parameter name).
        smiles: SMILES string of the molecule.
        Role: label placed at the start of every description, e.g. "Reactant".

    Returns:
        list with one string per detected group, containing the role, the
        SMILES, the group name with its SMARTS, and the match count.
    """
    detected = find_functional_groups(smiles, func_groups_df)
    # Reverse lookup (group name -> SMARTS); for duplicate names the last
    # SMARTS in the mapping is used.
    smarts_by_name = {name: smarts for smarts, name in func_groups_df.items()}
    return [
        f'{Role}: [{smiles}]. Functional Group: {name}({smarts_by_name[name]}). Count: {count}'
        for name, count in detected.items()
    ]

from sklearn.cluster import KMeans

def select_representatives(df, n_clusters=10, samples_per_cluster=10, random_state=42):
    """Pick representative rows by clustering a one-hot encoded DataFrame.

    The frame is encoded with ``pd.get_dummies`` and clustered with KMeans.
    From every cluster up to ``samples_per_cluster`` rows are drawn at random;
    clusters smaller than that contribute all of their rows.

    Args:
        df (pd.DataFrame): data to encode and cluster. No missing-value
            handling is done here; the data is assumed to be clean already.
        n_clusters (int): number of clusters, 10 by default.
        samples_per_cluster (int): rows to draw from each cluster, 10 by default.
        random_state (int): seed used for both the clustering and the sampling.

    Returns:
        list: index labels (from ``df``) of the selected rows, cluster by cluster.
    """
    features = pd.get_dummies(df)

    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    # Keep the labels in a Series aligned with the original index so the
    # selected positions can be reported as index labels.
    assignments = pd.Series(model.fit_predict(features), index=features.index)

    chosen = []
    for label in range(n_clusters):
        members = assignments[assignments == label].index

        if len(members) < samples_per_cluster:
            # Too few rows in this cluster: take all of them.
            chosen.extend(members.tolist())
            continue

        # Enough rows: draw a reproducible random subset.
        drawn = pd.Series(members).sample(n=samples_per_cluster, random_state=random_state)
        chosen.extend(drawn.tolist())

    return chosen


if __name__ == '__main__':
    # Script entry point: split the dataset into train/test indices, build the
    # prompt records for every row and write the split plus CSV exports.
    df = pd.read_csv(r'./data/cpa_ti_add/Denmark_input_data_processed.csv')
    train_size = 100
    tag = 'cpa'
    select_method = 'random'
    if select_method == 'random':
        ids = shuffle(np.arange(len(df)), random_state=42)
        train_idx = ids[:train_size]
        test_idx = ids[train_size:]
    else:
        # Without this the script would fail later with a NameError.
        raise ValueError(f"Unsupported select_method: {select_method!r}")

    data_info_dict = {
        "train_idx": train_idx,
        "val_idx": test_idx
    }
    # The dataset may hold fewer rows than requested, so use the real size.
    train_size = len(train_idx)

    # makedirs also creates missing parent directories and tolerates an
    # already existing target directory.
    out_dir = f'data/data4regression/{tag}_{train_size}'
    os.makedirs(out_dir, exist_ok=True)
    torch.save(data_info_dict, f'./{out_dir}/split_idx.pt')

    # NOTE(review): the JSON path mentions "buchwald" although tag is 'cpa';
    # kept unchanged, confirm whether this is intended.
    save_json_path = './data/buchwald/Buchwald_Hartwig_func.json'
    os.makedirs(os.path.dirname(save_json_path), exist_ok=True)
    results, raw_dict = extract_latents_of_percond_inputs(df, save_json_path=save_json_path)
    res_lens = [len(i['instruction']) for i in results]
    print('max lens', max(res_lens), '. mean lens', np.mean(res_lens))

    train_set = [results[i] for i in train_idx]
    test_set = [results[i] for i in test_idx]

    train_df = pd.DataFrame(train_set)
    test_df = pd.DataFrame(test_set)
    all_df = pd.DataFrame(results)

    all_df.to_csv(f'{out_dir}/all.csv', index=False)
    train_df.to_csv(f'{out_dir}/train.csv', index=False)
    test_df.to_csv(f'{out_dir}/test.csv', index=False)


