from datasets import load_dataset


def get_data_for_validation(
    config, tokenizer, max_length: int = 64, prepend_bos: bool = False
):
    """Return C4 validation texts truncated to ``max_length`` tokens.

    Loads the first shard of the English C4 validation split, takes the
    first ``config.num_examples`` documents, truncates each one to at most
    ``max_length`` tokens of ``tokenizer`` and decodes the kept tokens back
    into a string.

    Args:
        config: Object exposing ``num_examples``, the number of documents
            to take from the start of the shard.
        tokenizer: Object exposing ``encode(text)`` and ``decode(ids)``.
            When ``prepend_bos`` is true it must also expose a non-``None``
            ``bos_token_id``.
        max_length: Maximum number of tokens kept per document. When
            ``prepend_bos`` is true the BOS token counts toward this limit.
        prepend_bos: If true, make sure every token sequence starts with
            the tokenizer's BOS token before it is truncated and decoded.

    Returns:
        A list of decoded, truncated strings, one per selected document.

    Raises:
        ValueError: If ``prepend_bos`` is true but the tokenizer has no
            BOS token id.
    """
    # NOTE(review): the original ``prepend_bos=True`` branch was an unfinished
    # stub (``tokenizer.decode()`` with no arguments). The behaviour below is
    # the literal reading of the parameter name -- confirm it matches what
    # downstream consumers expect.
    bos_id = None
    if prepend_bos:
        bos_id = getattr(tokenizer, "bos_token_id", None)
        if bos_id is None:
            # Fail before the (slow, networked) dataset load.
            raise ValueError(
                "prepend_bos=True requires a tokenizer with a bos_token_id"
            )

    dataset = load_dataset(
        "allenai/c4", data_files=["en/c4-validation.00000-of-00008.json.gz"]
    )
    # With ``data_files`` and no explicit split, everything lands in "train".
    dataset = dataset.remove_columns(["timestamp", "url"])["train"]
    raw_texts = dataset[: config.num_examples]["text"]

    truncated_texts = []
    for text in raw_texts:
        token_ids = tokenizer.encode(text)
        if prepend_bos:
            token_ids = list(token_ids)
            # Some tokenizers already add BOS in ``encode``; avoid doubling it.
            if token_ids[:1] != [bos_id]:
                token_ids = [bos_id, *token_ids]
        truncated_texts.append(tokenizer.decode(token_ids[:max_length]))
    return truncated_texts
