


















import json
import os

import datasets

_CITATION = 

_DESCRIPTION = 

_HOMEPAGE = "https://allenai.org/data/drop"


_LICENSE = ""

_URLS = {
    "drop": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip",
}

_EMPTY_VALIDATED_ANSWER = [
    {
        "number": "",
        "date": {
            "day": "",
            "month": "",
            "year": "",
        },
        "spans": [],
        "worker_id": "",
        "hit_id": "",
    }
]


class Drop(datasets.GeneratorBasedBuilder):
    

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="drop", version=VERSION, description="The DROP dataset."),
    ]

    def _info(self):
        features = datasets.Features(
            {
                "section_id": datasets.Value("string"),
                "passage": datasets.Value("string"),
                "question": datasets.Value("string"),
                "query_id": datasets.Value("string"),
                "answer": {
                    "number": datasets.Value("string"),
                    "date": {
                        "day": datasets.Value("string"),
                        "month": datasets.Value("string"),
                        "year": datasets.Value("string"),
                    },
                    "spans": datasets.features.Sequence(datasets.Value("string")),
                    "worker_id": datasets.Value("string"),
                    "hit_id": datasets.Value("string"),
                },
                "validated_answers": datasets.features.Sequence(
                    {
                        "number": datasets.Value("string"),
                        "date": {
                            "day": datasets.Value("string"),
                            "month": datasets.Value("string"),
                            "year": datasets.Value("string"),
                        },
                        "spans": datasets.features.Sequence(datasets.Value("string")),
                        "worker_id": datasets.Value("string"),
                        "hit_id": datasets.Value("string"),
                    }
                ),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        urls = _URLS[self.config.name]
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "drop_dataset", "drop_dataset_train.json"),
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "drop_dataset", "drop_dataset_dev.json"),
                    "split": "validation",
                },
            ),
        ]

    
    def _generate_examples(self, filepath, split):
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)
            key = 0
            for section_id, example in data.items():
                
                for qa in example["qa_pairs"]:
                    
                    answer = qa["answer"]
                    answer = {
                        "number": answer["number"],
                        "date": {
                            "day": answer["date"].get("day", ""),
                            "month": answer["date"].get("month", ""),
                            "year": answer["date"].get("year", ""),
                        },
                        "spans": answer["spans"],
                        "worker_id": answer.get("worker_id", ""),
                        "hit_id": answer.get("hit_id", ""),
                    }
                    validated_answers = []
                    if "validated_answers" in qa:
                        for validated_answer in qa["validated_answers"]:
                            va = {
                                "number": validated_answer.get("number", ""),
                                "date": {
                                    "day": validated_answer["date"].get("day", ""),
                                    "month": validated_answer["date"].get("month", ""),
                                    "year": validated_answer["date"].get("year", ""),
                                },
                                "spans": validated_answer.get("spans", ""),
                                "worker_id": validated_answer.get("worker_id", ""),
                                "hit_id": validated_answer.get("hit_id", ""),
                            }
                            validated_answers.append(va)
                    else:
                        validated_answers = _EMPTY_VALIDATED_ANSWER
                    yield key, {
                        "section_id": section_id,
                        "passage": example["passage"],
                        "question": qa["question"],
                        "query_id": qa["query_id"],
                        "answer": answer,
                        "validated_answers": validated_answers,
                    }
                    key += 1
