#!/usr/bin/env python3
"""
Test script that only tests C4 dataset loading without model imports.
"""

import os
import sys
from pathlib import Path

# Mirror the training script's Hugging Face environment before any dataset
# code is imported. The values are applied in one ordered update: generous
# download/request timeouts, cache and home directories under /data, and
# offline mode explicitly switched off for both datasets and transformers.
os.environ.update(
    {
        "HF_HUB_DOWNLOAD_TIMEOUT": "1200",
        "REQUESTS_TIMEOUT": "1200",
        "HF_DATASETS_CACHE": "/data",
        "HF_HOME": "/data",
        "HF_DATASETS_OFFLINE": "0",
        "TRANSFORMERS_OFFLINE": "0",
    }
)

def main() -> int:
    """Smoke-test C4 loading through torchtitan's dataset helpers.

    Imports ``_load_c4_dataset`` and ``_process_c4_text`` straight from the
    ``torchtitan/datasets`` directory next to this script (avoiding the
    package-level imports), loads ``allenai/c4``, and runs the text processor
    over the first three samples.

    Returns:
        0 if the helpers import, the dataset loads, and three samples are
        processed; 1 on any failure, so callers (shell, CI, launchers) can
        detect the failure from the exit status instead of parsing stdout.
    """
    # Import only the dataset module directly: putting its directory first on
    # sys.path lets us import it as a top-level module.
    sys.path.insert(0, str(Path(__file__).parent / "torchtitan" / "datasets"))

    # Keep this try limited to the import so that an ImportError raised later
    # (e.g. lazily while loading the dataset) is not misreported as a path
    # problem and still gets a full traceback below.
    try:
        from hf_datasets import _load_c4_dataset, _process_c4_text
    except ImportError as e:
        print(f"❌ Import error: {e}")
        print("This suggests a path issue.")
        return 1

    print("✅ Successfully imported dataset functions")

    try:
        # Test the exact same function that torchtitan uses
        print("🔄 Testing _load_c4_dataset function...")
        dataset = _load_c4_dataset("allenai/c4")
        print("✅ Dataset loaded successfully")

        # Test iteration (same as torchtitan does). next() is used on purpose:
        # a dataset yielding fewer than three samples raises StopIteration and
        # is reported as a failure rather than silently passing.
        print("🔄 Testing dataset iteration...")
        iterator = iter(dataset)
        for sample_number in range(1, 4):
            sample = next(iterator)
            processed_text = _process_c4_text(sample)
            print(f"  Sample {sample_number}: text length = {len(processed_text)}")
    except Exception as e:
        # Top-level boundary of a diagnostic script: report everything.
        print(f"❌ Error loading dataset: {e}")
        print("This is the actual dataset error.")

        # Print detailed error information
        import traceback
        print("\nFull traceback:")
        traceback.print_exc()
        return 1

    print("🎉 SUCCESS! C4 dataset works perfectly!")
    return 0


if __name__ == "__main__":
    # Propagate success/failure to the caller through the process exit code.
    sys.exit(main())