
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from generation.main import load_all_configs

import json
import os

from tqdm import tqdm

def process_submissions(input_path, output_path):
    """Score submissions with their behavior's judge and write the results.

    Reads a JSON list of submission dicts from ``input_path`` and drops every
    entry whose ``response`` is the string "error" (case-insensitive,
    surrounding whitespace ignored). For each remaining entry, the config
    returned by ``load_all_configs()`` for the entry's ``behavior`` is looked
    up and ``config.judge(system_prompt, messages)`` is called, where
    ``system_prompt`` is the ``content`` of the first response message and
    ``messages`` is the rest of the response list. The judge's return value
    (or ``None`` if the judge raised) is stored under
    ``submission["transfer_results"]["bool_score"]``.

    Entries are left unscored, but are still written out, when:
      * no config with a callable ``judge`` exists for the behavior,
      * the response is empty, or
      * the response is not a list whose first element is a dict.

    Args:
        input_path: Path of the JSON file holding the list of submissions.
        output_path: Path of the JSON file the filtered, scored list is
            written to (overwritten if it exists).

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If ``input_path`` does not contain valid JSON.
    """
    # presumably a mapping of behavior name -> config object; only ``len`` and
    # ``.get`` are relied on here.
    configs = load_all_configs()
    print(f"Loaded {len(configs)} configs")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_path, "r", encoding="utf-8") as f:
        submissions = json.load(f)

    def _is_error_marker(entry):
        """Return True when the entry's response is the literal "error" string."""
        raw = entry.get("response")
        return isinstance(raw, str) and raw.strip().lower() == "error"

    submissions = [s for s in submissions if not _is_error_marker(s)]

    # The number printed is how many submissions remain after filtering.
    print(f"Filtered {len(submissions)} submissions")

    count = 0
    true_count = 0
    false_count = 0
    error_count = 0

    for submission in tqdm(submissions, desc="Scoring submissions"):
        behavior = submission.get("behavior")
        response = submission.get("response", [])
        config = configs.get(behavior)
        if not config or not callable(getattr(config, "judge", None)):
            # If no config or judge, skip scoring
            print(f"No config or judge for behavior {behavior}")
            continue
        if not response:
            # Not enough messages to judge
            continue
        if not isinstance(response, list) or not isinstance(response[0], dict):
            # A non-"error" string (or any other non-conversation value) would
            # otherwise crash on ``response[0].get`` and abort the whole run
            # before anything is written.
            print(
                f"Skipping malformed response for behavior {behavior}: "
                f"expected a list of message dicts, got {type(response).__name__}",
                file=sys.stderr,
            )
            continue

        # The first message is the system prompt; the rest are the messages.
        system_prompt = response[0].get("content", "")
        messages = response[1:]

        try:
            judge_result = config.judge(system_prompt, messages)
        except Exception as e:
            # Best effort: one failing judge must not stop the batch, but the
            # failure is reported instead of being silently discarded.
            print(f"Judge failed for behavior {behavior}: {e!r}", file=sys.stderr)
            judge_result = None

        # Store the result, replacing a missing or non-dict transfer_results.
        if not isinstance(submission.get("transfer_results"), dict):
            submission["transfer_results"] = {}
        submission["transfer_results"]["bool_score"] = judge_result
        count += 1

        # Anything other than the exact booleans counts as an error.
        if judge_result is True:
            true_count += 1
        elif judge_result is False:
            false_count += 1
        else:
            error_count += 1

    # Write to output file. Every filtered submission is written, scored or not.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(submissions, f, indent=2)
    # NOTE: ``count`` is the number of submissions that were scored, which can
    # be smaller than the number of entries written.
    print(f"Wrote {count} submissions to {output_path}")
    print(f"True: {true_count}, False: {false_count}, Error: {error_count}")
if __name__ == "__main__":
    # Command-line entry point: needs at least an input path and an output
    # path; any further arguments are ignored.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print("Usage: python prog_judges.py <input_path> <output_path>")
        sys.exit(1)
    input_path, output_path = cli_args[0], cli_args[1]
    process_submissions(input_path, output_path)
