"""Module for constants related to the SmolLM corpus."""

from repo.dataset.constants.dataset_constants_types import (
    TRAIN_CONSTANT,
    DatasetConstants,
    DataSplitConstants,
)

# Passed as `path` for every split defined in this module.
# NOTE(review): looks like a Hugging Face Hub dataset repo id -- confirm.
DATASET_PATH = "HuggingFaceTB/smollm-corpus"
# Subset identifiers: each of the next three values is used as the `name` of
# exactly one of the dataset constants in this module.
COSMOPEDIA_V2 = "cosmopedia-v2"
PYTHON_EDU = "python-edu"
FINEWEB_EDU_DEDUP = "fineweb-edu-dedup"
# Not referenced anywhere else in this module.
# NOTE(review): presumably the storage bucket that holds the source files for
# the python-edu subset -- verify against the code that imports this name.
PYTHON_EDU_BUCKET_NAME = "softwareheritage"


def _single_train_split(subset_name: str) -> DatasetConstants:
    """Build the constants for a SmolLM corpus subset with only a train split.

    All subsets in this module share the same `path` and split settings and
    differ only in their `name`, so they are built from a single place to keep
    them from drifting apart.

    Args:
        subset_name: Value passed as `name` to the split constants, e.g.
            `COSMOPEDIA_V2`.

    Returns:
        A `DatasetConstants` whose `splits` mapping has exactly one entry,
        keyed by `TRAIN_CONSTANT`.
    """
    return DatasetConstants(
        splits={
            TRAIN_CONSTANT: DataSplitConstants(
                path=DATASET_PATH,
                name=subset_name,
                split=TRAIN_CONSTANT,
                folder_split=TRAIN_CONSTANT,
                # NOTE(review): None presumably means "do not truncate" --
                # confirm against DataSplitConstants.
                truncated_samples=None,
            ),
        },
    )


smollm_corpus_cosmo_constants = _single_train_split(COSMOPEDIA_V2)


smollm_corpus_python_edu_constants = _single_train_split(PYTHON_EDU)


smollm_corpus_fineweb_edu_dedup_constants = _single_train_split(
    FINEWEB_EDU_DEDUP
)
