#!/usr/bin/env python3
"""
Quick Test Script for ICLR 2026 Supplementary Material
Runs a minimal experiment to verify the implementation works correctly.
"""

import json
import os
import sys
from datetime import datetime

# Test the core voting logic without API calls
def test_voting_methods():
    """Smoke-test ``apply_voting_methods`` on a small fixed sample.

    Appends ``experiments/aime24`` (relative to the current working
    directory) to ``sys.path``, imports ``apply_voting_methods`` from
    ``sequential_template`` and calls it with five answers and five entropy
    values. The check passes when the ``simple_majority`` entry equals
    ``"123"`` and an ``entropy_weighted`` entry is present.

    Returns:
        bool: ``True`` when both checks pass; ``False`` when a check fails or
        when any exception (including a failed import) is raised.
    """
    print("🧪 Testing voting method implementations...")

    # Fixed sample: "123" appears three times out of five, so it is the majority.
    sample_answers = ["123", "456", "123", "789", "123"]
    # Lower entropy = higher confidence
    sample_entropies = [2.1, 1.5, 2.3, 1.8, 1.2]

    try:
        # The voting helper lives in one of the experiment files.
        sys.path.append('experiments/aime24')
        from sequential_template import apply_voting_methods

        outcome = apply_voting_methods(sample_answers, sample_entropies)

        print("✅ Voting methods test results:")
        for method_name, winner in outcome.items():
            print(f"   {method_name}: {winner}")

        # Guard clause: the plain majority vote must pick "123".
        if outcome.get('simple_majority') != "123":
            print("❌ Simple majority voting: FAILED")
            return False
        print("✅ Simple majority voting: PASSED")

        # Only the presence of the entropy-weighted result is checked.
        if 'entropy_weighted' not in outcome:
            print("❌ Entropy weighted voting: FAILED")
            return False
        print("✅ Entropy weighted voting: PASSED")

        return True

    except Exception as err:
        print(f"❌ Error testing voting methods: {err}")
        return False

def test_data_loading():
    """Check that the two benchmark datasets can be loaded.

    Imports ``load_dataset`` from the ``datasets`` package, loads
    ``Maxwell-Jia/AIME_2024`` and the ``gpqa_diamond`` configuration of
    ``Idavidrein/gpqa``, and prints the size of each ``train`` split.

    Returns:
        bool: ``True`` when both datasets load; ``False`` when anything in the
        process raises (missing package, network error, unavailable dataset).
    """
    print("\n🧪 Testing dataset loading...")

    try:
        from datasets import load_dataset

        # AIME 2024
        print("   Loading AIME 2024 dataset...")
        aime = load_dataset("Maxwell-Jia/AIME_2024")
        aime_train = aime['train']
        print(f"   ✅ AIME 2024: {len(aime_train)} problems")

        # GPQA Diamond
        print("   Loading GPQA Diamond dataset...")
        gpqa = load_dataset("Idavidrein/gpqa", "gpqa_diamond")
        gpqa_train = gpqa['train']
        print(f"   ✅ GPQA Diamond: {len(gpqa_train)} problems")

    except Exception as err:
        print(f"   ❌ Error loading datasets: {err}")
        print("   This may be due to network issues or dataset availability.")
        return False

    else:
        # Reached only when both loads and both prints completed.
        return True

def test_analysis_functions():
    """Smoke-test the filename parser and JSON metric extractor.

    Makes ``analysis`` (relative to the current working directory) importable,
    imports ``parse_filename_info`` and ``extract_json_metrics`` from
    ``comprehensive_analysis`` and runs two checks:

    1. ``parse_filename_info`` must return something containing the keys
       ``benchmark``, ``strategy``, ``chains`` and ``model`` for a sample
       results filename.
    2. ``extract_json_metrics`` must report ``accuracy == 70.0`` for a small
       results file written to disk.

    The results file is created inside a private temporary directory, so an
    existing ``test_results.json`` in the working directory is never
    overwritten or deleted, and the scratch file is removed even when
    ``extract_json_metrics`` raises.

    Returns:
        bool: ``True`` when both checks pass; ``False`` when a check fails or
        when any exception (including a failed import) is raised.
    """
    # Local import: only this check needs a scratch directory.
    import tempfile

    print("\n🧪 Testing analysis functions...")

    try:
        # Register the directory once so repeated calls do not grow sys.path.
        if 'analysis' not in sys.path:
            sys.path.append('analysis')
        from comprehensive_analysis import extract_json_metrics, parse_filename_info

        # Test filename parsing
        test_filename = "AIME24_Sequential_3steps_gpt-oss-120b_20240101_120000.json"
        info = parse_filename_info(test_filename)

        expected_keys = ['benchmark', 'strategy', 'chains', 'model']
        if all(key in info for key in expected_keys):
            print("✅ Filename parsing: PASSED")
        else:
            print("❌ Filename parsing: FAILED")
            return False

        # Test mock JSON parsing
        mock_json = {
            "experiment_summary": {
                "model": "test-model",
                "strategy": "sequential",
                "total_questions_processed": 10,
                "total_tokens_used": {"total": 1000}
            },
            "final_accuracies_percent": {
                "simple_majority": "70.0%"
            }
        }

        # The context manager deletes the directory (and the file in it) on
        # every exit path, including an exception from extract_json_metrics.
        with tempfile.TemporaryDirectory() as scratch_dir:
            # Keep the original basename in case the extractor inspects the
            # file name -- NOTE(review): confirm against comprehensive_analysis.
            results_path = os.path.join(scratch_dir, "test_results.json")
            with open(results_path, "w", encoding="utf-8") as f:
                json.dump(mock_json, f)

            metrics = extract_json_metrics(results_path)

        if 'accuracy' in metrics and metrics['accuracy'] == 70.0:
            print("✅ JSON metrics extraction: PASSED")
        else:
            print("❌ JSON metrics extraction: FAILED")
            return False

        return True

    except Exception as e:
        print(f"❌ Error testing analysis functions: {e}")
        return False

def main():
    """Run every quick check in order and print a pass/fail summary.

    Calls the voting, dataset-loading and analysis checks one after another,
    counts how many returned a truthy value and prints either follow-up
    instructions (all passed) or a warning (at least one failed).
    """
    rule = "=" * 60

    print("🧪 ICLR 2026 Supplementary Material - Quick Test Suite")
    print(rule)

    # (section title, zero-argument callable returning a truthy value on success)
    suite = (
        ("Voting Methods", test_voting_methods),
        ("Dataset Loading", test_data_loading),
        ("Analysis Functions", test_analysis_functions),
    )

    outcomes = []
    for title, check in suite:
        print(f"\n📋 {title}:")
        print("-" * 40)
        outcomes.append(bool(check()))

    total = len(suite)
    passed = sum(outcomes)  # True counts as 1

    print("\n" + rule)
    print(f"📊 Test Results: {passed}/{total} tests passed")

    if passed != total:
        print("⚠️  Some tests failed. Please check the implementation.")
        print("   Note: Dataset loading failures may be due to network issues.")
    else:
        print("🎉 All tests passed! The implementation is working correctly.")
        print("\n📝 To run actual experiments:")
        print("1. Set your OpenRouter API key in the experiment files")
        print("2. Run: python experiments/aime24/parallel_template.py")

# Run the test suite only when this file is executed as a script; importing
# the module defines the functions without running anything.
if __name__ == "__main__":
    main()