import os
import evaluate
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

# Constants taken directly from experiments/lora/lora_finetune.py
# NOTE(review): presumably these must stay in sync with that file so the
# assets cached here are the ones the experiment later loads -- confirm.

# Hub identifier used for both the tokenizer and the causal-LM weights.
MODEL_ID = "Qwen/Qwen3-0.6B"
# Dataset identifier passed to load_dataset() without a config name.
TRAIN_DATASET = "flytech/python-codes-25k"
# Evaluation dataset identifier and the config name passed alongside it.
EVAL_DATASET_NAME = "wiki40b"
EVAL_DATASET_CONFIG = "fr"


def main():
    """Fetch every asset listed in the module constants, in a fixed order.

    The steps run sequentially: tokenizer and model for ``MODEL_ID``, the
    training dataset, the evaluation dataset (with its config), and the
    ``perplexity`` metric. A banner is printed before each step. Loader
    return values are discarded; the calls are made for their caching
    effect. Nothing is caught here, so an exception from any loader
    propagates and the final success line is not printed.
    """
    # Show the cache root so it can be compared against the Slurm script.
    cache_root = os.environ.get(
        "HF_HOME", "Default (check ~/.cache/huggingface)")
    print(f"Using HF_HOME: {cache_root}")

    # Each entry is (banner, loaders); loaders run in the order listed.
    steps = (
        (
            f"--- Downloading Model & Tokenizer: {MODEL_ID} ---",
            (
                # Pulls config, weights, and tokenizer files into the cache.
                lambda: AutoTokenizer.from_pretrained(
                    MODEL_ID, trust_remote_code=True),
                lambda: AutoModelForCausalLM.from_pretrained(
                    MODEL_ID, trust_remote_code=True),
            ),
        ),
        (
            f"--- Downloading Training Dataset: {TRAIN_DATASET} ---",
            (lambda: load_dataset(TRAIN_DATASET),),
        ),
        (
            f"--- Downloading Evaluation Dataset: {EVAL_DATASET_NAME} ---",
            (lambda: load_dataset(EVAL_DATASET_NAME, EVAL_DATASET_CONFIG),),
        ),
        (
            "--- Downloading Metrics ---",
            # 'perplexity' is a small python script served from HF Spaces.
            (lambda: evaluate.load("perplexity"),),
        ),
    )

    for banner, loaders in steps:
        print(banner)
        for fetch in loaders:
            fetch()

    print("\nSuccess! All assets are cached.")


# Run the download sequence only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
