import os
import json
import random
from datasets import load_dataset
import re
import pandas as pd

# Root directory that receives one sub-folder per processed dataset.
SAVE_DIR = "data"
os.makedirs(SAVE_DIR, exist_ok=True)

# Seed the module-level generator so the sampling below is repeatable.
random.seed(42)


def save_json(data, path):
    """Write *data* to *path* as a pretty-printed UTF-8 JSON array.

    Args:
        data: Iterable of JSON-serialisable items; it is materialised into
            a list before being dumped.
        path: Destination file. Missing parent directories are created.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the parent when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(list(data), f, ensure_ascii=False, indent=2)


##FP
# Duplicate of the top-of-file import; harmless.
import random

# Raw input file handed to load_fp_data.
filepath = "./original_data/Sentences_AllAgree.txt"
def load_fp_data(filepath):
    """Load "sentence@label" lines and split them into train and test sets.

    Each line is split at its last "@": the text before it is the sentence,
    and the text after it, with all whitespace removed, is the label. Blank
    lines and lines without "@" are skipped.

    The global ``random`` generator is reseeded with 42 before the test set
    is drawn. This makes the split reproducible, and it also affects any
    random draws the caller makes afterwards.

    Args:
        filepath: Path to the ISO-8859-1 encoded input file.

    Returns:
        A ``(train_data, test_data)`` tuple of lists of
        ``{"sentence": ..., "label": ...}`` dicts. ``test_data`` holds up to
        250 sampled records; ``train_data`` holds every record that is not
        equal to a test record.
    """
    data = []
    with open(filepath, encoding="iso-8859-1") as f:
        for line in f:
            sentence, sep, label = line.rpartition("@")
            if not sep:
                # No "@" on this line (e.g. a blank line): nothing to parse.
                continue
            data.append({"sentence": sentence, "label": re.sub(r'\s+', '', label)})
    random.seed(42)
    # Cap at the pool size; random.sample raises ValueError otherwise.
    test_data = random.sample(data, min(250, len(data)))
    # Hashable keys give O(1) membership while keeping the original
    # equality-based exclusion (duplicates of a test record are dropped too).
    held_out = {(item["sentence"], item["label"]) for item in test_data}
    train_data = [
        item for item in data
        if (item["sentence"], item["label"]) not in held_out
    ]
    return train_data, test_data

name = 'fp'
train_data, test_data = load_fp_data(filepath)
folder = os.path.join(SAVE_DIR, name)
save_json(test_data, os.path.join(folder, "test.json"))
# Cap the draw at the pool size, matching the other dataset sections;
# random.sample raises ValueError when asked for more items than exist.
sampled = random.sample(train_data, min(500, len(train_data)))
# The first 400 sampled records train; the next (up to) 100 validate.
trainset = sampled[:400]
evalset = sampled[400:500]
save_json(trainset, os.path.join(folder, "train.json"))
save_json(evalset, os.path.join(folder, "val.json"))

##BBH
dataset_name = 'date'
folder = os.path.join(SAVE_DIR, dataset_name)

# Train/val pool: draw at most 500 items, then split them 80/20.
trainpath = f"./original_data/{dataset_name}_train.json"
with open(trainpath, 'rb') as fh:
    datapool = json.load(fh)

sampled = random.sample(datapool, min(500, len(datapool)))
trainset = sampled[:int(0.8 * len(sampled))]
# Everything after the training slice becomes the validation set.
evalset = sampled[len(trainset):]
save_json(trainset, os.path.join(folder, "train.json"))
save_json(evalset, os.path.join(folder, "val.json"))

# Test split: the eval file's items are nested under the "examples" key.
testpath = f"./original_data/{dataset_name}_eval.json"
with open(testpath, 'rb') as fh:
    test_data = json.load(fh)['examples']
save_json(test_data, os.path.join(folder, "test.json"))



##GPQA
# The diamond file is used as the test split; the train/val pool is every
# main-file row whose 'Record ID' does not appear in the diamond file.
testdf = pd.read_csv("./original_data/gpqa_diamond.csv")
orgin_data = pd.read_csv("./original_data/gpqa_main.csv")
traindf = orgin_data.loc[
    ~orgin_data['Record ID'].isin(testdf['Record ID'])
]
def load_gpqa_examples(question_df):
    """Turn each row of *question_df* into a shuffled multiple-choice example.

    The global ``random`` generator is reseeded with 42 on every call, so the
    choice order is reproducible for a given frame.

    Args:
        question_df: DataFrame with the columns 'Question',
            'Incorrect Answer 1', 'Incorrect Answer 2', 'Incorrect Answer 3'
            and 'Correct Answer'.

    Returns:
        A list of ``{"input": prompt, "target": letter}`` dicts, where the
        letter is one of '(A)'..'(D)' and marks the correct answer's position.
    """
    labels = ['(A)', '(B)', '(C)', '(D)']
    random.seed(42)

    def shuffle_choices_and_create_example(row):
        answer = row['Correct Answer']
        options = [
            row['Incorrect Answer 1'],
            row['Incorrect Answer 2'],
            row['Incorrect Answer 3'],
            answer,
        ]
        random.shuffle(options)
        first, second, third, fourth = options
        question = row['Question']
        return {
            "input": f"{question}\n\nChoices:\n(A) {first}\n(B) {second}\n(C) {third}\n(D) {fourth}",
            # Letter of the first slot holding the correct answer text.
            "target": f"{labels[options.index(answer)]}",
        }

    examples = []
    for _, record in question_df.iterrows():
        examples.append(shuffle_choices_and_create_example(record))
    return examples
name = 'gpqa'
folder = os.path.join(SAVE_DIR, name)

# Keep this call order: each call reseeds the global generator, and the
# sample drawn below continues from the state left by the second call.
trainvalset = load_gpqa_examples(traindf)
testset = load_gpqa_examples(testdf)
save_json(testset, os.path.join(folder, "test.json"))

# At most 500 pooled examples: the first 200 train, the remainder validate.
sampled = random.sample(trainvalset, min(500, len(trainvalset)))
trainset, evalset = sampled[:200], sampled[200:]
save_json(trainset, os.path.join(folder, "train.json"))
save_json(evalset, os.path.join(folder, "val.json"))

##GSM8K
def read_jsonl(path: str):
    """Parse a JSON Lines file into a list with one object per non-blank line.

    Args:
        path: Path to a UTF-8 encoded ``.jsonl`` file.

    Returns:
        The decoded JSON values in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as fh:
        # Lines keep their trailing newline, so a plain truthiness test never
        # filters anything; strip first so blank lines are really skipped.
        return [json.loads(line) for line in fh if line.strip()]

name = 'gsm8k'
folder = os.path.join(SAVE_DIR, name)
trainvalset = read_jsonl("./original_data/train.jsonl")
testset = read_jsonl("./original_data/test.jsonl")

# At most 500 pooled examples: the first 400 train, the rest validate.
# The sample never exceeds 500 items, so an open-ended slice is enough.
sampled = random.sample(trainvalset, min(500, len(trainvalset)))
trainset, evalset = sampled[:400], sampled[400:]
save_json(trainset, os.path.join(folder, "train.json"))
save_json(evalset, os.path.join(folder, "val.json"))

# `sampled` is reused for the test draw of at most 250 examples.
sampled = random.sample(testset, min(250, len(testset)))
save_json(sampled, os.path.join(folder, "test.json"))
