import os
import sys
import subprocess


def main() -> None:
    """Evaluate every configured model in turn, then generate the plots.

    Each model is evaluated by launching ``black_box_evals.py`` (located next
    to this file) in a subprocess, using the current interpreter and this
    file's directory as the working directory. Because the subprocesses are
    started with ``check=True``, a non-zero exit status raises
    ``subprocess.CalledProcessError`` and the remaining models and the
    plotting step are not run. ``method_plots.py`` is launched only after all
    evaluations have completed.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    interpreter = sys.executable

    eval_script = os.path.join(here, "black_box_evals.py")
    plots_script = os.path.join(here, "method_plots.py")
    dataset_path = os.path.join(here, "datasets", "dataset_minimal.json")
    results_dir = os.path.join(here, "eval_results")

    # (model name, results file name inside eval_results/), in run order.
    runs = (
        ("deepseek/deepseek-r1-0528", "deepseek_r1_all.json"),
        ("openai/gpt-5-mini", "gpt_5_mini_all.json"),
        ("google/gemini-2.5-pro", "gemini_2.5_pro_all.json"),
        ("openai/gpt-oss-120b", "gpt_oss_120b_all.json"),
        ("qwen/qwen3-32b", "qwen3_32b_all.json"),
    )

    # Switches passed unchanged to every evaluation run.
    analysis_flags = [
        "--first_person",
        "--reasoning_traces_analysis",
        "--probability_eval_analysis",
        "--binary_eval_analysis",
        "--purpose_awareness_analysis",
    ]

    banner = "=" * 80

    for model_name, results_name in runs:
        print(banner)
        print(f"Starting evaluations for model: {model_name}")
        print(banner)

        eval_cmd = [interpreter, eval_script]
        eval_cmd += ["--model", model_name]
        eval_cmd += ["--input_file", dataset_path]
        eval_cmd += ["--output_file", os.path.join(results_dir, results_name)]
        eval_cmd += ["--max_workers", "976"]
        eval_cmd += analysis_flags

        # Note: MAX_RETRIES=2 and JUDGE_MODEL=openai/gpt-5-mini are defaults inside black_box_evals.py
        subprocess.run(eval_cmd, check=True, cwd=here)

        print(f"Completed evaluations for model: {model_name}")

    print("All model evaluations finished. Generating plots...")
    plot_cmd = [interpreter, plots_script]
    subprocess.run(plot_cmd, check=True, cwd=here)
    print("Plot generation complete.")


# Run the evaluation-and-plotting pipeline only when this file is executed
# directly as a script, not when it is imported as a module.
if __name__ == "__main__":
    main()


