'''
- gmb_formatter.py
- This file handles formatting the GMB corpus into various formats for training
'''

# External imports
import random
import pandas as pd

# Internal imports
from src.core.configuration.datagen_conf import *

'''
----------import_data----------
- Loads the GMB data from a CSV file and groups its tokens into sentences
-----Inputs-----
- csv_path - Optional path of the CSV file to read (defaults to GMB_LOC)
-----Output-----
- gmb_data - A dict with two parallel lists, "texts" and "tags"; entry i of
             each list holds the words / simplified tags of sentence i
'''
def import_data(csv_path=None):
    """Load the GMB CSV and return its words and tags grouped by sentence."""
    # Fall back to the configured corpus location when no path is supplied
    source = GMB_LOC if csv_path is None else csv_path
    raw_data = pd.read_csv(source, encoding="unicode_escape")

    # Collapse the B-/I- prefixed labels into their bare entity type.
    # Series.map leaves NaN for any tag that is not a key of this map.
    label_map = {
        "B-per": "per", "I-per": "per",
        "B-eve": "eve", "I-eve": "eve",
        "B-org": "org", "I-org": "org",
        "B-art": "art", "I-art": "art",
        "B-nat": "nat", "I-nat": "nat",
        "B-gpe": "gpe", "I-gpe": "gpe",
        "B-geo": "geo", "I-geo": "geo",
        "B-tim": "tim", "I-tim": "tim",
        "O": "O",
    }
    simple_tags = raw_data["Tag"].map(label_map)

    gmb_data = {"texts": [], "tags": []}
    words, labels = [], []

    # Walk the three relevant columns in lockstep; only the first token of a
    # sentence carries a "Sentence ..." marker in the "Sentence #" column.
    for marker, word, tag in zip(raw_data["Sentence #"], raw_data["Word"], simple_tags):
        # A marker on a non-empty buffer means the previous sentence is done
        if words and "Sentence" in str(marker):
            gmb_data["texts"].append(words)
            gmb_data["tags"].append(labels)
            words, labels = [], []
        words.append(word)
        labels.append(tag)

    # Flush the final sentence: no further marker follows it, so the loop
    # above never pushes it on its own.
    if words:
        gmb_data["texts"].append(words)
        gmb_data["tags"].append(labels)

    return gmb_data
    


'''
----------generate_splits----------
- Placeholder for splitting the loaded GMB data into train, test, and valid sets
-----Inputs-----
- N/A - The function currently takes no arguments
-----Output-----
- N/A - Nothing is returned; writing the splits to files in src/data is the
        intended behaviour but is not implemented yet
'''
def generate_splits():
    # TODO: not implemented yet - calling this is currently a no-op
    return None