# load VAD Lexicon
# map words to word type (e.g., adjective, ...) and filter out words not matching any of our supported categories
# Open question: should we also keep only words with a known antonym?

# load dataset
# iterate over all sentences
# check for occurrence of words in the lexicon (only single occurrence, do we care about multiple occurrences?)
# [ADJECTIVE], [NOUN], ..., maybe even save antonym if available

# Open question: what if the target word and its antonym have different lengths when tokenized?
# -> If we filter for equal tokenized lengths, the resulting dataset differs from model to model!
from gradiend.data import read_bookcorpus

def filter():
    """Filter the VAD lexicon to supported word types and save it as CSV.

    Each term returned by ``read_vad()`` is analysed with spaCy and mapped to
    a "replacement group" id (POS tag plus selected morphological features).
    Terms without a group id are dropped; the remaining rows, including the
    new ``replacement_group`` column, are written to ``data/emotion/vad.csv``.

    Note: the function name shadows the builtin ``filter``; it is kept
    unchanged so existing callers keep working.
    """
    import os

    import spacy

    from gradiend.setups.emotion.data import read_vad

    # Requires the model to be installed:
    # python -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm")

    # Morphological features appended to the group id, per supported POS tag.
    # The tuple order defines the order of the features inside the id.
    features_by_pos = {
        "ADJ": ("Degree",),  # e.g., 'Pos', 'Cmp', 'Sup'
        "NOUN": ("Number",),  # e.g., 'Sing', 'Plur'
        "VERB": ("VerbForm", "Tense", "Person", "Number"),
    }

    def get_replacement_group_id(word):
        """Return the replacement group id of ``word``, or ``None``.

        The id has the form ``POS|Feature=Value|...``. ``None`` is returned
        when the term is not a string, cannot be parsed, is not tokenized
        into exactly one token, or has an unsupported POS tag.
        """
        if not isinstance(word, str):
            return None  # e.g. missing values in the lexicon

        try:
            doc = nlp(word)
        except Exception:
            # Best effort: a term spaCy cannot process is skipped instead of
            # aborting the whole run (previously this fell through and
            # crashed on the unbound ``doc``).
            return None

        if len(doc) != 1:
            return None  # only process single words for now

        token = doc[0]
        feature_names = features_by_pos.get(token.pos_)
        if feature_names is None:
            return None  # ignore other POS

        group = [token.pos_]
        for feature_name in feature_names:
            values = token.morph.get(feature_name)
            # Only include features that exist for this token.
            if values:
                group.append(f"{feature_name}={values[0]}")

        return "|".join(group)

    vad_data = read_vad()

    # Apply the mapping function
    vad_data["replacement_group"] = vad_data["term"].apply(get_replacement_group_id)

    # Filter out rows without a replacement group
    vad_data_filtered = vad_data[vad_data["replacement_group"].notnull()].reset_index(drop=True)

    # Save the filtered DataFrame to a CSV file; make sure the target
    # directory exists before writing.
    output = 'data/emotion/vad.csv'
    os.makedirs(os.path.dirname(output), exist_ok=True)
    vad_data_filtered.to_csv(output, index=False)

def read_vad_filtered():
    """Load ``data/emotion/vad.csv`` into a pandas DataFrame.

    The ``term`` column is read as ``str`` and pandas' default NaN markers
    are disabled (``keep_default_na=False`` with no custom ``na_values``).
    """
    import pandas

    csv_path = 'data/emotion/vad.csv'
    read_options = {
        'dtype': {'term': str},
        'na_values': [],
        'keep_default_na': False,
    }
    return pandas.read_csv(csv_path, **read_options)

def determine_antonyms():
    """Print an overview of the replacement groups in the filtered lexicon.

    Despite its name, this function currently computes no antonyms: it prints
    the sizes of the 20 largest replacement groups and then, for every group,
    its size and up to 20 example terms.
    """
    lexicon = read_vad_filtered()

    group_sizes = lexicon.groupby('replacement_group').size()
    largest_groups = group_sizes.sort_values(ascending=False).head(20)
    print(largest_groups)

    # Example groups with an example term each (author's notes):
    # NOUN|Number=Sing                -> battery
    # ADJ|Degree=Pos                  -> good
    # VERB|VerbForm=Inf               -> love
    # VERB|VerbForm=Fin|Tense=Past    -> loved
    # VERB|VerbForm=Part|Tense=Pres   -> loving
    # NOUN|Number=Plur                -> batteries
    # VERB|VerbForm=Part|Tense=Past   -> loved
    # VERB|VerbForm=Fin|Tense=Pres    -> love
    # VERB|VerbForm=Fin|Tense=Pres|Person=3|Number=Sing -> loves
    # ADJ|Degree=Cmp                  -> better
    #     -> Problematic for antonyms (e.g., bigger -> small is found as the
    #        antonym; it could be mapped to the correct inflection, but that
    #        produces errors such as "expensiver")
    # ADJ|Degree=Sup                  -> best
    #     -> similar problems

    # For each group, print its size and the first 20 terms
    separator = '-' * 50
    for group_name, members in lexicon.groupby('replacement_group'):
        sample_terms = members['term'].head(20).tolist()
        print(f"Group: {group_name}, Count: {len(members)}")
        print(sample_terms)
        print(separator)


    #
import pandas as pd
from tqdm import tqdm
import re

def create_templates():
    """Create masked sentence templates for lexicon words and save them.

    Every sentence in ``read_bookcorpus()['text']`` is split with a simple
    regex tokenizer and compared case-insensitively against the terms of the
    filtered VAD lexicon. For each lexicon word found in a sentence, one
    record is produced in which all occurrences of that word are replaced by
    ``[MASK]``.

    The records are written to ``data/emotion/templates.csv`` with the
    columns ``original``, ``template``, ``masked_word`` and ``num_masks``.

    Returns:
        The DataFrame holding all created template records.
    """
    dataset = read_bookcorpus()
    words = {w.lower() for w in read_vad_filtered()['term']}  # Set for fast lookup

    # Compile the patterns once instead of on every sentence / word.
    # Very simple tokenizer: runs of word characters, or single punctuation marks.
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    space_before_punctuation = re.compile(r'\s+([.,!?;:])')

    # Fixed column list so the CSV keeps its header even without any records.
    columns = ['original', 'template', 'masked_word', 'num_masks']
    records = []

    for sentence in tqdm(dataset['text'], desc="Processing sentences"):
        tokens = token_pattern.findall(sentence)
        tokens_lower = [t.lower() for t in tokens]

        # TODO: check casing
        # Sort the matches: iterating the raw set would make the record order
        # differ between runs.
        for word in sorted(words.intersection(tokens_lower)):
            mask_count = tokens_lower.count(word)
            template = ' '.join(
                '[MASK]' if lowered == word else token
                for token, lowered in zip(tokens, tokens_lower)
            )

            # Fix spacing around punctuation (optional)
            template = space_before_punctuation.sub(r'\1', template)

            records.append({
                'original': sentence,
                'template': template,
                'masked_word': word,
                'num_masks': mask_count,
            })

    df = pd.DataFrame(records, columns=columns)
    print(f"Created {len(df)} templates.")
    output = 'data/emotion/templates.csv'
    df.to_csv(output, index=False)
    return df

def apply_filtering():
    """Placeholder: no filtering is implemented yet; does nothing and returns ``None``."""
    return None



# Script entry point: the pipeline steps are toggled by (un)commenting the
# calls below; currently only the template creation is active.
# NOTE(review): this call is a top-level statement, so it also runs when the
# module is imported - consider an `if __name__ == "__main__":` guard.
#determine_antonyms()
#filter()
create_templates()