#!/usr/bin/env python3
"""
Test script that mimics exactly how torchtitan loads the C4 dataset.
This will help identify if there are any environment differences.
"""

import os
import sys
from pathlib import Path

# Mirror the training script's Hugging Face configuration. This runs before
# the torchtitan loader is imported further down, so the values are already
# in the process environment by then.
os.environ.update(
    {
        # Long timeouts for slow or large downloads.
        # NOTE(review): confirm which libraries actually read REQUESTS_TIMEOUT.
        "HF_HUB_DOWNLOAD_TIMEOUT": "1200",
        "REQUESTS_TIMEOUT": "1200",
        # Both the datasets cache and the general HF home point at /data.
        "HF_DATASETS_CACHE": "/data",
        "HF_HOME": "/data",
        # Offline flags set to "0" (presumably: network access allowed).
        "HF_DATASETS_OFFLINE": "0",
        "TRANSFORMERS_OFFLINE": "0",
    }
)

# Put this script's directory first on sys.path (same as training) so that
# `torchtitan` can be imported relative to this file's location.
sys.path.insert(0, os.fspath(Path(__file__).parent))

# Step 1: import the loader. This has its own try block so that only a failure
# to import torchtitan is reported as a path/environment problem; an
# ImportError raised later, while loading or iterating, falls through to the
# generic handler below and gets a full traceback.
try:
    from torchtitan.datasets.hf_datasets import _load_c4_dataset
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("This suggests a path or environment issue.")
    # Exit non-zero so shells, CI jobs and launchers can detect the failure.
    sys.exit(1)

print("✅ Successfully imported torchtitan dataset loader")

# Step 2: load the dataset through the same function torchtitan uses and pull
# the first three samples from it.
try:
    # Test the exact same function that torchtitan uses
    print("🔄 Testing _load_c4_dataset function...")
    dataset = _load_c4_dataset("allenai/c4")
    print("✅ Dataset loaded successfully")

    # Test iteration (same as torchtitan does). next() is used deliberately:
    # a dataset yielding fewer than three samples raises StopIteration, which
    # is reported as a failure below instead of passing silently.
    print("🔄 Testing dataset iteration...")
    iterator = iter(dataset)
    for i in range(3):
        sample = next(iterator)
        print(f"  Sample {i+1}: text length = {len(sample['text'])}")

    print("🎉 SUCCESS! C4 dataset works exactly like in torchtitan!")

except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    print("This is the same error you're seeing in training.")

    # Print detailed error information
    import traceback
    print("\nFull traceback:")
    traceback.print_exc()

    # Signal the failure through the exit status as well as the printed output.
    sys.exit(1)