import json
import uuid
from dataclasses import asdict, dataclass, field
from datetime import datetime
from typing import List, Union


@dataclass
class DatasetReference:
    """Metadata record for a dataset.

    The first ten fields are required. ``uuid`` and ``creation_date`` are
    generated per instance at construction time unless the caller passes
    explicit values; the remaining fields have constant defaults.
    """

    name: str
    # Either a single source or a list of sources.
    sources: Union[str, List[str]]
    tokenized: bool
    num_tokens: int
    # NOTE(review): unit of ``size`` is not visible here -- confirm with callers.
    size: int
    dataset_url: str
    manifest_url: str
    scrub_commit_hash: str
    scrub_diff: str
    sampling_yaml: str

    # default_factory is required here: a plain default expression would be
    # evaluated once when the class is defined, so every instance would share
    # the same id / timestamp. The lambdas resolve ``uuid`` and ``datetime``
    # from module globals, not from the class namespace.
    uuid: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Stored as a formatted string (local time), not a datetime object.
    creation_date: str = field(
        default_factory=lambda: datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
    )
    tokenizer: str = "EleutherAI/gpt-neox-20b"
    data_key: str = "json.gz"
    note: str = ""
