import json
import pandas as pd
import os
from pathlib import Path

from typing import List, Tuple


def read_data(data_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, List[str]]:
    """Read the GoEmotions annotation splits and the emotion label list.

    Args:
        data_folder: Directory containing ``train.tsv``, ``dev.tsv``,
            ``test.tsv`` and ``emotions.txt``.

    Returns:
        A ``(train, valid, test, emotions)`` tuple. The three data frames are
        the header-less, tab-separated split files (columns are labelled
        ``0, 1, 2, ...``). ``emotions`` holds one entry per line of
        ``emotions.txt`` in file order, so a line's position is its index in
        the list.
    """
    # Read the annotations from the local path
    train_data = pd.read_csv(os.path.join(data_folder, "train.tsv"), sep="\t", header=None)
    valid_data = pd.read_csv(os.path.join(data_folder, "dev.tsv"), sep="\t", header=None)
    test_data = pd.read_csv(os.path.join(data_folder, "test.tsv"), sep="\t", header=None)

    # Read the list of emotions from the local path. The context manager
    # closes the file deterministically; the encoding is explicit so the
    # result does not depend on the platform default.
    with open(os.path.join(data_folder, "emotions.txt"), encoding="utf-8") as emotions_file:
        # Blank lines are kept on purpose: dropping them would shift the
        # positions of the entries that follow.
        emotions = [line.strip("\n") for line in emotions_file]

    return train_data, valid_data, test_data, emotions


def convert_to_unified(data: pd.DataFrame, emotions: List[str], split: str) -> dict:
    """Convert one split of the dataset to the unified format.

    Args:
        data: Frame whose column ``0`` holds the text, column ``1`` the
            comma-separated emotion indices and column ``2`` the instance id.
        emotions: Emotion names; each index found in column ``1`` is looked
            up in this list.
        split: Split name stored on every instance (e.g. ``"train"``).

    Returns:
        A dict with a ``"prompt"`` description and a ``"request_states"`` list
        containing one entry per row of ``data``, in row order.
    """
    # Construct the structure of the processed data. The prompt uses the
    # fixed task prefix as its instructions; the emotion list itself is not
    # embedded in the prompt.
    unified_data = {
        "prompt": {
            "instructions": "Sentiment Classification: ",
            "input_prefix": "Text: ",
            "input_suffix": "\n",
            "output_prefix": "Answer: ",
            "output_suffix": "\n"
        },
        "request_states": []
    }
    request_states = unified_data["request_states"]

    # Pull the three columns out once. ``tolist()`` yields plain Python
    # scalars, which keeps the result JSON-serializable and avoids building
    # a row Series several times per row.
    texts = data[0].tolist()
    label_fields = data[1].tolist()
    instance_ids = data[2].tolist()

    # Convert the original dataset into the unified format
    for text, label_field, instance_id in zip(texts, label_fields, instance_ids):
        # ``str()`` covers frames in which every row has a single label: the
        # column is then parsed as integers rather than strings.
        names = sorted(emotions[int(index)] for index in str(label_field).split(","))
        # Add the unified annotation to the processed data
        request_states.append({
            "instance": {
                "input": {
                    "text": text
                },
                "references": [{
                    "output": {
                        "text": " | ".join(f"(emotion; is; {name})" for name in names)
                    }
                }],
                "split": split,
                "id": instance_id
            },
            "request": {}
        })

    print(len(request_states))
    return unified_data


if __name__ == "__main__":
    # Read the GoEmotion dataset from the local path
    train_data, valid_data, test_data, emotions = read_data("../../data/goemotions/data")

    # Convert the GoEmotion dataset to the unified format, keyed by the name
    # of the file each split is written to.
    unified_by_file = {
        "train.json": convert_to_unified(train_data, emotions, "train"),
        "dev.json": convert_to_unified(valid_data, emotions, "dev"),
        "test.json": convert_to_unified(test_data, emotions, "test"),
    }

    output_dir = Path("../../unified_data/SC/goemo")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Write the processed data back to the local path. Each file is opened in
    # a context manager so it is flushed and closed before the next one is
    # written, rather than whenever the handle happens to be collected.
    for file_name, unified in unified_by_file.items():
        with open(output_dir / file_name, "w", encoding="utf-8") as output_file:
            json.dump(unified, output_file, indent=4)
