import os
from typing import cast

from datasets import Dataset, IterableDataset, load_dataset
from dotenv import load_dotenv

from custom_colbert.dataset.hf_dataset_names import TrainDatasets

# Load variables from a local `.env` file; `override=True` lets values from
# `.env` replace variables that are already set in the process environment.
load_dotenv(override=True)

# Fail fast when no Hugging Face token is configured. `HF_TOKEN` is the variable
# read by the Hugging Face libraries; the `HT_TOKEN` spelling is also accepted
# for backward compatibility with existing `.env` files.
if os.getenv("HF_TOKEN") is None and os.getenv("HT_TOKEN") is None:
    raise ValueError("HF_TOKEN is not set. Please set it to your Hugging Face API token in `.env`.")

# Column names that every synthetic dataset checked by the test below must expose.
EXPECTED_FEATURES = ["query", "image", "image_filename", "answer", "page", "model", "prompt", "source"]


def test_is_synthetic_doc_qa_has_required_features():
    """Check that every synthetic training dataset exposes all expected columns.

    Each dataset returned by ``TrainDatasets.get_synthetic_datasets()`` is opened
    with ``streaming=True`` and its declared feature names are compared against
    ``EXPECTED_FEATURES``. The test fails on the first dataset whose schema is
    unavailable or is missing one of the expected columns.
    """
    # Build the reference set once instead of once per dataset.
    expected_features = set(EXPECTED_FEATURES)

    for ds in TrainDatasets.get_synthetic_datasets():
        # With `streaming=True` and an explicit split, `load_dataset` returns an
        # `IterableDataset` rather than a map-style `Dataset`.
        dataset = cast(IterableDataset, load_dataset(ds.value, split="train", streaming=True))

        # A streamed dataset may not know its schema up front, in which case
        # `features` is None; report that explicitly instead of letting
        # `set(None)` raise an unrelated TypeError.
        features = dataset.features
        assert features is not None, f"Could not determine features of {ds}"

        missing_features = expected_features - set(features)
        assert not missing_features, f"Missing features in {ds}: {missing_features}"
