import os
from datasets import load_dataset as datasets_load_dataset
import pandas as pd



def load_word_dataset(
    path,
    sample_num,
    most_relevant_k,
    random_state,
    filter_duplicates=True,
    column=None,
):
    """Load a word-list CSV, clean its list columns, and sample rows.

    Every column other than ``synset_id`` and ``definition`` is treated as a
    language column: string cells are evaluated into Python objects and the
    resulting lists are normalised with ``process_array``.

    Args:
        path: Location of the CSV file, passed to ``pandas.read_csv``.
        sample_num: Number of rows to sample. When fewer rows are available
            a warning is printed and all remaining rows are returned.
        most_relevant_k: When greater than 0, every list cell in a language
            column is truncated to its first ``most_relevant_k`` entries.
        random_state: Seed forwarded to ``DataFrame.sample``.
        filter_duplicates: When true, keep only rows in which no word occurs
            more than once across all language columns of that row.
        column: Optional column name, or list of column names, to restrict
            the dataset to before any processing.

    Returns:
        A ``pandas.DataFrame`` with a fresh 0-based index.
    """
    passthrough_columns = ("synset_id", "definition")

    dataset = pd.read_csv(path)
    if column is not None:
        # A bare column name would select a Series, which has no ``.columns``
        # and broke the processing below; always select a DataFrame instead.
        selected = [column] if isinstance(column, str) else list(column)
        # Copy so the column rewrites below act on an independent frame.
        dataset = dataset[selected].copy()

    language_columns = [
        name for name in dataset.columns if name not in passthrough_columns
    ]

    for name in language_columns:
        # NOTE(security): eval() executes whatever Python expression a cell
        # contains. Only load CSV files from a trusted source; if the cells
        # are plain literals, ast.literal_eval would be a safer choice.
        dataset[name] = dataset[name].apply(
            lambda cell: eval(cell) if isinstance(cell, str) else cell
        )
        dataset[name] = dataset[name].apply(process_array)

    if filter_duplicates:
        kept_indices = []
        for idx, row in dataset.iterrows():
            # Pool the words of every language column for this row.
            row_words = []
            for name in language_columns:
                if isinstance(row[name], list):
                    row_words.extend(row[name])

            # Keep the row only when no word is repeated across languages.
            if len(row_words) == len(set(row_words)):
                kept_indices.append(idx)

        filtered_dataset = dataset.loc[kept_indices]
    else:
        filtered_dataset = dataset

    if len(filtered_dataset) >= sample_num:
        sampled_dataset = filtered_dataset.sample(
            n=sample_num, random_state=random_state
        ).reset_index(drop=True)
    else:
        print(
            f"Warning: Only {len(filtered_dataset)} rows remain after filtering. Using all available rows."
        )
        sampled_dataset = filtered_dataset.reset_index(drop=True)

    if most_relevant_k > 0:
        for name in language_columns:
            # Non-list cells (e.g. NaN) are passed through unchanged.
            sampled_dataset[name] = sampled_dataset[name].apply(
                lambda cell: (
                    cell[:most_relevant_k] if isinstance(cell, list) else cell
                )
            )

    return sampled_dataset


"""def load_bible_dataset(list_of_lang, column, sample_num):
    
    language_sentences = {}
    
    for lang, column_name in zip(list_of_lang, column):
        if lang == "en":
            dataset = datasets_load_dataset("bible_para", "en-fr", trust_remote_code=True)
            sentences = [render_combined_sentences(i, "en", dataset) for i in range(sample_num)]
            language_sentences[column_name] = sentences
        else:
            langs = ["en", lang]
            langs.sort()  
            config_name = f"{langs[0]}-{langs[1]}"
            try:
                dataset = datasets_load_dataset("bible_para", config_name, trust_remote_code=True)
                sentences = [render_combined_sentences(i, lang, dataset) for i in range(sample_num)]
                language_sentences[column_name] = sentences
            except Exception as e:
                print(f"Error loading language pair {config_name}: {str(e)}")
                print(f"Skipping language {lang}")
    
    
    df = pd.DataFrame(language_sentences)
    
    return df"""


def load_bible_dataset(list_of_lang, column, sample_num):
    """Build a DataFrame of Bible sentences, one column per language.

    Args:
        list_of_lang: Language codes to load; paired positionally with
            ``column``.
        column: Output column name for each language code.
        sample_num: Number of rows (indices ``0 .. sample_num - 1``) to read
            for every language.

    Returns:
        A ``pandas.DataFrame`` in which each cell is a one-element list
        holding the text produced by ``render_combined_sentences``.
    """
    sentences_by_column = {}

    for lang, column_name in zip(list_of_lang, column):
        # English is read from the en/fr pair; any other language is paired
        # with English.
        partner = "fr" if lang == "en" else lang
        # The two codes are passed in sorted order (presumably what the
        # "bible_para" loader expects -- confirm against its documentation).
        first_code, second_code = sorted(["en", partner])

        corpus = datasets_load_dataset(
            "bible_para",
            lang1=first_code,
            lang2=second_code,
            trust_remote_code=True,
        )

        sentences_by_column[column_name] = [
            [render_combined_sentences(row, lang, corpus)]
            for row in range(sample_num)
        ]

    return pd.DataFrame(sentences_by_column)


def process_array(arr):
    """Normalise a list of words and drop duplicates, keeping first-seen order.

    String entries have underscores replaced by spaces, are split on ``"|"``
    into separate entries, and are stripped of surrounding whitespace; empty
    pieces are discarded. Non-string entries are kept as they are. Anything
    that is not a ``list`` is returned unchanged.

    Args:
        arr: The value to normalise.

    Returns:
        A new de-duplicated list, or ``arr`` itself when it is not a list.
    """
    if not isinstance(arr, list):
        return arr

    unique = []
    already_added = set()

    def _keep(value):
        # Record ``value`` the first time it is encountered.
        if value not in already_added:
            unique.append(value)
            already_added.add(value)

    for entry in arr:
        if isinstance(entry, str):
            # split("|") yields the whole string when it contains no "|",
            # so a single loop covers both the plain and the piped form.
            for piece in entry.replace("_", " ").split("|"):
                piece = piece.strip()
                if piece:
                    _keep(piece)
        else:
            _keep(entry)

    return unique


def render_combined_sentences(index, lang, dataset, sentences_per_sample=1):
    """Return the text of sample ``index`` in language ``lang``.

    A sample is a run of ``sentences_per_sample`` consecutive rows of
    ``dataset["train"]`` starting at ``index * sentences_per_sample``. The
    ``["translation"][lang]`` strings of those rows are concatenated without
    a separator. With the default of 1 this is simply the sentence at row
    ``index``.

    Args:
        index: Zero-based sample number.
        lang: Language key looked up inside each row's ``"translation"``.
        dataset: Mapping with a ``"train"`` entry that supports integer
            indexing.
        sentences_per_sample: Number of consecutive rows merged into one
            sample.

    Returns:
        The concatenated text as a ``str``.

    Raises:
        ValueError: If ``sentences_per_sample`` is smaller than 1.
    """
    if sentences_per_sample < 1:
        raise ValueError("sentences_per_sample must be at least 1")

    first_row = index * sentences_per_sample
    rows = dataset["train"]
    return "".join(
        rows[row]["translation"][lang]
        for row in range(first_row, first_row + sentences_per_sample)
    )


def load_dataset(dataset_type, **kwargs):
    """Dispatch to the word or Bible loader based on ``dataset_type``.

    Args:
        dataset_type: ``"word"`` or ``"bible"`` (matched case-insensitively).
        **kwargs: Loader options.
            For ``"word"``: ``path`` (required), ``sample_num`` (default
            200), ``most_relevant_k`` (default 1), ``random_state`` (default
            42), ``filter_duplicates`` (default True), ``column`` (default
            None).
            For ``"bible"``: ``list_of_lang`` and ``column`` (both required),
            ``sample_num`` (default 200).

    Returns:
        Whatever the selected loader returns.

    Raises:
        ValueError: If a required argument is missing or ``dataset_type`` is
            not recognised.
    """
    kind = dataset_type.lower()

    if kind == "word":
        # Report the first required argument that was not supplied.
        absent = [name for name in ("path",) if name not in kwargs]
        if absent:
            raise ValueError(
                f"Missing required argument for word dataset: {absent[0]}"
            )

        word_options = {
            "path": kwargs["path"],
            "sample_num": kwargs.get("sample_num", 200),
            "most_relevant_k": kwargs.get("most_relevant_k", 1),
            "random_state": kwargs.get("random_state", 42),
            "filter_duplicates": kwargs.get("filter_duplicates", True),
            "column": kwargs.get("column", None),
        }
        return load_word_dataset(**word_options)

    if kind == "bible":
        absent = [
            name for name in ("list_of_lang", "column") if name not in kwargs
        ]
        if absent:
            raise ValueError(
                f"Missing required argument for Bible dataset: {absent[0]}"
            )

        bible_options = {
            "list_of_lang": kwargs["list_of_lang"],
            "column": kwargs["column"],
            "sample_num": kwargs.get("sample_num", 200),
        }
        return load_bible_dataset(**bible_options)

    raise ValueError(
        f"Unknown dataset type: {dataset_type}. Choose either 'word' or 'bible'."
    )
