


















import json
import os

import datasets

# NOTE(review): the citation and description literals were missing from this
# copy of the file (bare `_CITATION =` / `_DESCRIPTION =`, a SyntaxError).
# They are restored here from the published TriviaQA paper/abstract — confirm
# the exact wording against the upstream loader before release.

# BibTeX entry surfaced through `DatasetInfo.citation`.
_CITATION = """\
@InProceedings{JoshiTriviaQA2017,
    author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
    title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
    booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
    month = {July},
    year = {2017},
    address = {Vancouver, Canada},
    publisher = {Association for Computational Linguistics},
}
"""

# Human-readable summary surfaced through `DatasetInfo.description`.
_DESCRIPTION = """\
TriviaQA is a reading comprehension dataset containing over 650K
question-answer-evidence triples. TriviaQA includes 95K question-answer pairs
authored by trivia enthusiasts and independently gathered evidence documents,
six per question on average, that provide high quality distant supervision for
answering the questions.
"""

_HOMEPAGE = "https://nlp.cs.washington.edu/triviaqa/"

_LICENSE = "Apache License 2.0"

# Single archive handed to the download manager; it is expected to contain the
# `unfiltered-web-train.jsonl` and `unfiltered-web-dev.jsonl` files.
_URLS = "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz"


class Triviaqa(datasets.GeneratorBasedBuilder):
    """Dataset builder for the unfiltered web portion of TriviaQA.

    Downloads one archive and exposes its ``unfiltered-web-train.jsonl`` and
    ``unfiltered-web-dev.jsonl`` files as the train and validation splits.
    """

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name="triviaqa", version=VERSION, description="The TriviaQA dataset"),
    ]

    def _info(self):
        """Return the ``DatasetInfo`` describing the example schema and metadata."""
        features = datasets.Features(
            {
                "question_id": datasets.Value("string"),
                "question_source": datasets.Value("string"),
                "question": datasets.Value("string"),
                "answer": {
                    "aliases": datasets.features.Sequence(
                        datasets.Value("string"),
                    ),
                    "value": datasets.Value("string"),
                },
                "search_results": datasets.features.Sequence(
                    {
                        "description": datasets.Value("string"),
                        "filename": datasets.Value("string"),
                        "rank": datasets.Value("int32"),
                        "title": datasets.Value("string"),
                        "url": datasets.Value("string"),
                        "search_context": datasets.Value("string"),
                    }
                ),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract the archive, then declare the two splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A list with the train and validation ``SplitGenerator`` objects,
            each carrying the ``filepath`` passed to ``_generate_examples``.
        """
        data_dir = dl_manager.download_and_extract(_URLS)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "unfiltered-web-train.jsonl"),
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, "unfiltered-web-dev.jsonl"),
                },
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs from one JSON Lines file.

        Args:
            filepath: Path of a UTF-8 text file holding one JSON record per line.

        Yields:
            Tuples of the zero-based line index and a dict shaped like the
            features declared in ``_info``. Search-result fields missing from
            a record default to ``""`` (``-1`` for ``rank``).

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a record lacks a required top-level field
                (``SearchResults``, ``QuestionId``, ``QuestionSource``,
                ``Question`` or ``Answer``).
        """
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                # Blank or whitespace-only lines carry no record; skipping
                # them avoids a JSONDecodeError. Keys stay tied to the line
                # index, so they remain unique and unchanged for other lines.
                if not row.strip():
                    continue
                data = json.loads(row)
                search_results = [
                    {
                        "description": result.get("Description", ""),
                        "filename": result.get("Filename", ""),
                        "rank": result.get("Rank", -1),
                        "title": result.get("Title", ""),
                        "url": result.get("Url", ""),
                        "search_context": result.get("SearchContext", ""),
                    }
                    for result in data["SearchResults"]
                ]
                answer = data["Answer"]
                yield key, {
                    "question_id": data["QuestionId"],
                    "question_source": data["QuestionSource"],
                    "question": data["Question"],
                    "answer": {
                        "aliases": answer["Aliases"],
                        "value": answer["Value"],
                    },
                    "search_results": search_results,
                }
