from datasets import load_dataset, Dataset
import os
import numpy as np

# Per-summary score fields, in the order they are written and averaged.
_SCORE_FIELDS = ("relevance", "coherence", "fluency", "consistency")


def _flatten_example(example: dict) -> list:
    """Expand one source example into one record per machine summary.

    Args:
        example: A mapping with a ``"text"`` entry, a
            ``"machine_summaries"`` sequence, and one sequence per name in
            ``_SCORE_FIELDS`` that is indexed by summary position.

    Returns:
        A list of dicts, one per machine summary, holding the source text,
        the summary, its four scores, and ``"overall_score"`` (the
        unweighted mean of the four scores).

    Raises:
        KeyError: If a required field is missing from ``example``.
        IndexError: If a score sequence is shorter than
            ``machine_summaries``.
    """
    text = example["text"]
    records = []
    for idx, summary in enumerate(example["machine_summaries"]):
        scores = {field: example[field][idx] for field in _SCORE_FIELDS}
        record = {"text": text, "machine_summary": summary}
        record.update(scores)
        # Equal-weight average across the four quality dimensions.
        record["overall_score"] = np.mean(list(scores.values()))
        records.append(record)
    return records


def process_summeval_dataset(
    dataset_name: str = "mteb/summeval",
    split: str = "test",
    output_dir: str = "dataset/",
    filename: str = "processed_summeval.json",
) -> None:
    """Flatten a SummEval-style dataset and write it to disk as JSON.

    Loads ``dataset_name`` with ``datasets.load_dataset``, turns every
    (text, machine summary) pair of the chosen split into its own record
    with an added ``overall_score``, and exports the result with
    ``Dataset.to_json``.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Name of the split to process.
        output_dir: Directory to write into; created if it does not exist.
        filename: Name of the output file inside ``output_dir``.

    Called with no arguments, this reads the ``"test"`` split of
    ``"mteb/summeval"`` and writes ``dataset/processed_summeval.json``.
    """
    dataset = load_dataset(dataset_name)

    new_examples = []
    for example in dataset[split]:
        new_examples.extend(_flatten_example(example))

    new_dataset = Dataset.from_list(new_examples)
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): `Dataset.to_json` is believed to emit JSON Lines (one
    # object per line) by default despite the ".json" suffix — confirm
    # against the datasets documentation before changing consumers.
    new_dataset.to_json(os.path.join(output_dir, filename))

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    process_summeval_dataset()
