import os
import json
from typing import Dict, List

def load_judge_programs(storage_dir: str = "synthesized_program_judges") -> Dict[str, Dict]:
    """
    Load all judge programs and their metadata from the storage directory.

    The directory is expected to contain ``programs_metadata.json`` (a JSON
    object keyed by program ID) and one ``<prog_id>.py`` source file per
    program. Problems are reported on stdout instead of being raised, so the
    caller always receives a (possibly empty) dictionary.

    Args:
        storage_dir (str): Directory where judge programs and metadata are stored.

    Returns:
        Dict[str, Dict]: Dictionary of program IDs mapped to a dict with the
        keys ``code``, ``description``, ``function_name``, ``criteria`` and
        ``file_path``. Programs whose source file is missing or unreadable
        are skipped.
    """
    programs: Dict[str, Dict] = {}
    metadata_file = os.path.join(storage_dir, 'programs_metadata.json')

    # Open directly instead of checking os.path.exists first: this avoids a
    # check-then-use race and also covers paths that exist but are unreadable.
    try:
        # Explicit UTF-8 so the result does not depend on the platform locale.
        with open(metadata_file, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
    except FileNotFoundError:
        print(f"Error: Metadata file {metadata_file} not found.")
        return programs
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"Error: Metadata file {metadata_file} is corrupted or invalid.")
        return programs
    except OSError as e:
        print(f"Error: Could not read metadata file {metadata_file}: {e}")
        return programs

    # Valid JSON is not necessarily a JSON object; anything else has no
    # program-ID -> metadata mapping to iterate over.
    if not isinstance(metadata, dict):
        print(f"Error: Metadata file {metadata_file} is corrupted or invalid.")
        return programs

    for prog_id, meta in metadata.items():
        if not isinstance(meta, dict):
            # Keep the program loadable, but fall back to default metadata.
            print(f"Warning: Metadata for {prog_id} is not an object; using defaults.")
            meta = {}

        prog_file = os.path.join(storage_dir, f"{prog_id}.py")
        try:
            with open(prog_file, 'r', encoding='utf-8') as f:
                code = f.read()
        except FileNotFoundError:
            print(f"Warning: Program file {prog_file} for {prog_id} not found.")
            continue
        except (OSError, UnicodeDecodeError) as e:
            print(f"Warning: Program file {prog_file} for {prog_id} could not be read: {e}")
            continue

        programs[prog_id] = {
            'code': code,
            'description': meta.get('description', 'No description'),
            'function_name': meta.get('function_name', ''),
            'criteria': meta.get('criteria', 'Unknown'),
            'file_path': meta.get('file_path', prog_file),
        }

    return programs


def test_judge_program(program: Dict, query: str, response: str) -> Dict:
    """
    Test a single judge program with the given query and response.

    The program's source is executed in a fresh namespace, the function named
    by ``program['function_name']`` is looked up there and called as
    ``judge(query, response)``. Any failure is converted into an error result
    instead of being raised.

    Args:
        program (Dict): Program metadata including ``code``, ``function_name``
            and ``criteria``.
        query (str): The input query to test.
        response (str): The response to evaluate.

    Returns:
        Dict: The judge's result (a dict with at least ``score``,
        ``reasoning`` and ``criteria``) plus ``executable: True``, or an error
        result with ``score`` 0.0 and ``executable: False``.
    """
    # Read metadata defensively up front so the error paths below can never
    # raise a KeyError of their own while reporting a failure.
    criteria = program.get('criteria', 'Unknown')
    function_name = program.get('function_name', '')

    def failure(reasoning: str) -> Dict:
        # Uniform shape for every non-executable outcome.
        return {
            'score': 0.0,
            'reasoning': reasoning,
            'criteria': criteria,
            'executable': False
        }

    try:
        namespace = {}
        # SECURITY: exec runs the stored program with full interpreter
        # privileges. Only load programs from a trusted storage directory.
        exec(program['code'], namespace)
        judge_function = namespace.get(function_name)

        # A name bound to a non-callable object is not a usable judge either.
        if not callable(judge_function):
            return failure(f"Function {function_name} not found in program code.")

        result = judge_function(query, response)
        if not (isinstance(result, dict) and all(key in result for key in ['score', 'reasoning', 'criteria'])):
            return failure("Invalid output format from judge function. Expected dict with keys: score, reasoning, criteria.")

        # Build a new dict rather than mutating the object the judge returned.
        return {**result, 'executable': True}

    except Exception as e:
        # Boundary around arbitrary generated code: report, never propagate.
        return failure(f"Error executing program: {str(e)}")


# The "test_" prefix would make pytest collect this helper as a test case
# whenever it is imported into a test module; opt out explicitly.
test_judge_program.__test__ = False


def _format_score(score) -> str:
    """Render a judge score for the summary table without assuming it is numeric."""
    try:
        return f"{float(score):<8.1f}"
    except (TypeError, ValueError):
        # Judge programs are only checked for the presence of 'score', not
        # its type, so fall back to plain text instead of crashing the report.
        return f"{str(score):<8}"


def main():
    """
    Run every stored judge program against one example query/response pair.

    Loads the programs from the default storage directory, prints the
    per-program outcome as each one is tested, and finishes with a summary
    table of program ID, score, executability and criteria.
    """
    # Example query and response
    query = "What is the capital city of France, and what is its largest airport?"

    response = "The capital city of France is **Paris**, a globally renowned center of art, culture, history, and politics. Often referred to as the “City of Light,” Paris is not only the political and administrative heart of France but also one of the most visited cities in the world, attracting millions of tourists annually to landmarks such as the Eiffel Tower, the Louvre Museum, and the Champs-Élysées. The largest and busiest airport serving Paris—and France overall—is **Charles de Gaulle Airport (CDG)**, also known as Roissy Airport. Located about 25 kilometers northeast of central Paris, it opened in 1974 and has since grown into one of Europe’s primary aviation hubs. CDG handles the majority of France’s international air traffic, serving as the main base for Air France and connecting Europe with destinations worldwide. Its extensive infrastructure makes it a key gateway for travelers entering and leaving France."

    # Load all judge programs
    programs = load_judge_programs()

    if not programs:
        print("No judge programs found. Please generate programs first.")
        return

    results: List[Dict] = []

    # Test each program
    for prog_id, program in programs.items():
        print("------------------------------")
        print(f"\n Testing {prog_id}: {program['description']} (Criteria: {program['criteria']})")
        result = test_judge_program(program, query, response)

        print(f"   Executable: {'✅' if result['executable'] else '🚫'}")
        print(f"   Score: {result['score']}")

        results.append({
            'prog_id': prog_id,
            'score': result['score'],
            'criteria': result['criteria'],
            'executable': result['executable']
        })

    # Print summary table
    separator = "-" * 70
    print("\n Summary Table:")
    print(separator)
    print(f"{'Program ID':<12} | {'Score':<8} | {'Executable':<12} | {'Criteria':<30}")
    print(separator)
    for result in results:
        # str() on criteria: a judge may return a non-string value, which
        # does not support the '<30' alignment format spec.
        print(f"{result['prog_id']:<12} | {_format_score(result['score'])} | {str(result['executable']):<12} | {str(result['criteria']):<30}")
    print(separator)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
