




















import json
import os

import datasets

_CITATION = 

_DESCRIPTION = 

# Project homepage; reported through the builder's DatasetInfo.
_HOMEPAGE = "https://aghie.github.io/head-qa/"

# License string; reported through the builder's DatasetInfo.
_LICENSE = "MIT License"

# Google Drive link to the data archive; it is handed to the download
# manager's download_and_extract() when the splits are prepared.
_URL = "https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t"

# Maps each builder config name to the directory (inside the extracted archive)
# that holds its JSON files. The same value is also the suffix of the per-split
# file names, e.g. "train_HEAD.json" / "train_HEAD_EN.json".
_DIRS = {"es": "HEAD", "en": "HEAD_EN"}


class HeadQA(datasets.GeneratorBasedBuilder):
    """Dataset builder for HEAD-QA, with a Spanish (``es``) and an English (``en``) config.

    Each example is one multiple-choice question together with the metadata
    (name, year, category) of the exam it belongs to.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="es", version=VERSION, description="Spanish HEAD dataset"),
        datasets.BuilderConfig(name="en", version=VERSION, description="English HEAD dataset"),
    ]

    DEFAULT_CONFIG_NAME = "es"

    def _info(self):
        """Return the ``DatasetInfo`` describing the features and metadata of the dataset."""
        features = datasets.Features(
            {
                # Exam-level fields, repeated on every question of the exam.
                "name": datasets.Value("string"),
                "year": datasets.Value("string"),
                "category": datasets.Value("string"),
                # Question-level fields.
                "qid": datasets.Value("int32"),
                "qtext": datasets.Value("string"),
                "ra": datasets.Value("int32"),
                # One entry per candidate answer of the question.
                "answers": [
                    {
                        "aid": datasets.Value("int32"),
                        "atext": datasets.Value("string"),
                    }
                ],
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            supervised_keys=None,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive and return the train/test/validation generators.

        Args:
            dl_manager: download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A list of ``datasets.SplitGenerator`` in the order train, test,
            validation. Each one passes ``data_dir`` (the extraction root) and
            ``filepath`` (the JSON file of the split) to ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(_URL)

        # The language directory name doubles as the suffix of the JSON file names.
        lang_dir_name = _DIRS[self.config.name]
        data_lang_dir = os.path.join(data_dir, lang_dir_name)

        # (split, file-name prefix); the validation split is stored as "dev_*.json".
        split_prefixes = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.TEST, "test"),
            (datasets.Split.VALIDATION, "dev"),
        )

        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={
                    "data_dir": data_dir,
                    "filepath": os.path.join(data_lang_dir, f"{prefix}_{lang_dir_name}.json"),
                },
            )
            for split, prefix in split_prefixes
        ]

    def _generate_examples(self, data_dir, filepath):
        """Yield ``(key, example)`` pairs read from one split's JSON file.

        Args:
            data_dir: extraction root of the archive. Not used here; it is kept
                because ``_split_generators`` passes it through ``gen_kwargs``.
            filepath: path of the JSON file of the split.

        Yields:
            Tuples ``(key, example)`` where ``key`` is ``"{exam_id}_{qid}"``
            (``exam_id`` being the position of the exam inside the file) and
            ``example`` is a dict matching the features declared in ``_info``.
        """
        # json.load consumes the whole file, so the handle can be released
        # before the (possibly long-lived) generator starts yielding.
        with open(filepath, encoding="utf-8") as f:
            head_qa = json.load(f)

        # "exams" is a mapping; only the position and the content of each exam are needed.
        for exam_id, content in enumerate(head_qa["exams"].values()):
            name = content["name"].strip()
            year = content["year"].strip()
            category = content["category"].strip()

            for question in content["data"]:
                # qid and ra are stripped and converted to int; aid is passed through as-is.
                qid = int(question["qid"].strip())
                ra = int(question["ra"].strip())
                answers = [
                    {"aid": answer["aid"], "atext": answer["atext"].strip()}
                    for answer in question["answers"]
                ]

                yield f"{exam_id}_{qid}", {
                    "name": name,
                    "year": year,
                    "category": category,
                    "qid": qid,
                    "qtext": question["qtext"].strip(),
                    "ra": ra,
                    "answers": answers,
                }
