"""Module to store constants for The Pile dataset."""

from repo.dataset.constants.dataset_constants_types import (
    TRAIN_CONSTANT,
    TRAIN_SMALL_CONSTANT,
    VAL_CONSTANT,
    VAL_SMALL_CONSTANT,
    VAL_XSMALL_CONSTANT,
    VALIDATION_CONSTANT,
    DatasetConstants,
    DataSplitConstants,
)

# Maps each short subset key to its integer index (0..15).
# The index is the key's position in the tuple below, so the ORDER of this
# tuple is significant: reordering or inserting in the middle renumbers
# every following entry. Append new keys at the end.
THE_PILE_CLIENT_MAP = {
    subset_key: index
    for index, subset_key in enumerate(
        (
            "wikipedia",
            "arxiv",
            "gutenberg",
            "hackernews",
            "pubmedcentral",
            "freelaw",
            "philpapers",
            "dmmathematics",
            "enronemails",
            "europarl",
            "nihexporter",
            "github",
            "pilecc",
            "pubmedabstract",
            "stackexchange",
            "usptobackgrounds",
        )
    )
}


# Maps each short subset key to its human-readable label.
# NOTE(review): the labels presumably mirror the subset names used in The
# Pile's own metadata -- confirm against the consumer before editing them.
THE_PILE_CLIENT_NAMING_MAP = dict(
    (
        ("wikipedia", "Wikipedia (en)"),
        ("arxiv", "ArXiv"),
        ("gutenberg", "Gutenberg (PG-19)"),
        ("hackernews", "HackerNews"),
        ("pubmedcentral", "PubMed Central"),
        ("freelaw", "FreeLaw"),
        ("philpapers", "PhilPapers"),
        ("dmmathematics", "DM Mathematics"),
        ("enronemails", "Enron Emails"),
        ("europarl", "EuroParl"),
        ("nihexporter", "NIH ExPorter"),
        ("github", "Github"),
        ("pilecc", "Pile-CC"),
        ("pubmedabstract", "PubMed Abstracts"),
        ("stackexchange", "StackExchange"),
        ("usptobackgrounds", "USPTO Backgrounds"),
    )
)


# ------------------- The Pile ------------------- #


def _pile_split(split, folder_split, truncated_samples) -> DataSplitConstants:
    """Build one split entry for The Pile.

    Every entry in this module uses an empty ``path`` and an empty ``name``;
    only the three arguments below differ between entries.

    Args:
        split: Value forwarded as ``DataSplitConstants.split``.
        folder_split: Value forwarded as ``DataSplitConstants.folder_split``.
        truncated_samples: Value forwarded as
            ``DataSplitConstants.truncated_samples`` (``None`` or an int
            here; presumably a cap on the number of samples -- confirm
            against ``DataSplitConstants``).

    Returns:
        The newly constructed ``DataSplitConstants``.
    """
    return DataSplitConstants(
        path="",
        name="",
        split=split,
        folder_split=folder_split,
        truncated_samples=truncated_samples,
    )


# The registry starts empty and is filled key by key, in the order below.
pile_constants = DatasetConstants(splits={})

# Train-derived entries: both use TRAIN_CONSTANT as their `split`.
pile_constants.splits[TRAIN_CONSTANT] = _pile_split(
    split=TRAIN_CONSTANT,
    folder_split=TRAIN_CONSTANT,
    truncated_samples=None,
)
pile_constants.splits[TRAIN_SMALL_CONSTANT] = _pile_split(
    split=TRAIN_CONSTANT,
    folder_split=TRAIN_SMALL_CONSTANT,
    truncated_samples=100000,
)

# Validation-derived entries: all use VALIDATION_CONSTANT as their `split`.
# VALIDATION_CONSTANT and VAL_CONSTANT are registered with identical settings.
pile_constants.splits[VALIDATION_CONSTANT] = _pile_split(
    split=VALIDATION_CONSTANT,
    folder_split=VAL_CONSTANT,
    truncated_samples=None,
)
pile_constants.splits[VAL_CONSTANT] = _pile_split(
    split=VALIDATION_CONSTANT,
    folder_split=VAL_CONSTANT,
    truncated_samples=None,
)
pile_constants.splits[VAL_SMALL_CONSTANT] = _pile_split(
    split=VALIDATION_CONSTANT,
    folder_split=VAL_SMALL_CONSTANT,
    truncated_samples=10000,
)
pile_constants.splits[VAL_XSMALL_CONSTANT] = _pile_split(
    split=VALIDATION_CONSTANT,
    folder_split=VAL_XSMALL_CONSTANT,
    truncated_samples=3000,
)
