#!/usr/bin/env python3


import argparse
import subprocess
import sys

# Compact table of the supported datasets: (name, test column, prompt column).
# Every dataset is stored under "datasets/<name>" and uses the "io" test format,
# so only the columns that vary between datasets are listed here.
_DATASET_COLUMNS = (
    ("codecontests", "generated_tests", "prompt"),
    ("livecodebench", "public_tests", "prompt"),
    ("livebench", "public_test_cases", "prompt"),
    ("codeforces", "tests", "prompt"),
    ("mbpp_io", "tests", "instruct_prompt"),
    ("codecontests_codeflux", "public_tests", "prompt"),
    ("livecodebench_codeflux", "public_tests", "prompt"),
)

# Dataset-specific configurations, keyed by the dataset name given on the
# command line. Insertion order follows the table above.
DATASET_CONFIGS = {
    name: {
        "path": f"datasets/{name}",
        "test_column": test_column,
        "format": "io",
        "prompt_column": prompt_column,
    }
    for name, test_column, prompt_column in _DATASET_COLUMNS
}


def _positive_int(value: str) -> int:
    """Convert a command-line string to an int, rejecting values below 1."""
    try:
        number = int(value)
    except ValueError:
        # Same wording argparse produces for a plain ``type=int`` failure.
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number


def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse command line arguments.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads them from ``sys.argv[1:]``.

    Returns:
        Namespace with the attributes ``model``, ``dataset``, ``gpus`` and ``k``.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid
            (including a ``--k`` value below 1) or when ``--help`` is given.
    """
    parser = argparse.ArgumentParser(description="Run model evaluation across different datasets")

    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Path to the model checkpoint"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        choices=list(DATASET_CONFIGS),
        help="Dataset name to evaluate on"
    )
    parser.add_argument(
        "--gpus",
        type=str,
        required=True,
        help="Comma-separated list of GPU IDs to use"
    )
    parser.add_argument(
        "--k",
        # A sample count of zero or less is meaningless, so reject it here
        # instead of forwarding it to the evaluation script.
        type=_positive_int,
        default=256,
        help="Number of samples to generate per problem (default: 256)"
    )

    return parser.parse_args(argv)


def run_evaluation(
        model_path: str,
        dataset: str,
        gpus: str,
        k: int
) -> None:
    """
    Run the evaluation with specified parameters.

    Builds the ``poetry run python evaluate_pass_at_k.py`` command from the
    dataset's entry in ``DATASET_CONFIGS`` and runs it as a subprocess. The
    tensor-parallel size passed to the script equals the number of GPU IDs.

    Args:
        model_path: Path to the model checkpoint
        dataset: Name of the dataset to evaluate on
        gpus: Comma-separated list of GPU IDs; surrounding whitespace and
            empty entries (e.g. a trailing comma) are ignored
        k: Number of samples per problem

    Raises:
        SystemExit: With status 1 when no GPU ID is given, the dataset is
            unknown, the command cannot be launched, the command exits with
            a non-zero status, or the user interrupts the run.
    """
    # Normalise the GPU list before counting it: a plain split(",") counts
    # "0,1," as three GPUs and "" as one, which yields a wrong
    # --tensor_parallel_size.
    gpu_ids = [
        gpu_id
        for gpu_id in (part.strip() for part in gpus.split(","))
        if gpu_id
    ]
    if not gpu_ids:
        print(
            f"Error: No GPU IDs found in '{gpus}'. "
            "Expected a comma-separated list such as '0,1'.",
            file=sys.stderr,
        )
        sys.exit(1)
    visible_gpus = ",".join(gpu_ids)

    if dataset not in DATASET_CONFIGS:
        print(
            f"Error: Unknown dataset '{dataset}'. Supported datasets: {', '.join(DATASET_CONFIGS)}",
            file=sys.stderr,
        )
        sys.exit(1)

    config = DATASET_CONFIGS[dataset]

    print("Starting evaluation with:")
    print(f"  Model: {model_path}")
    print(f"  Dataset: {config['path']}")
    print(f"  GPUs: {visible_gpus}")
    print(f"  K: {k}")
    print()

    cmd = [
        "poetry", "run", "python", "evaluate_pass_at_k.py",
        "--model", model_path,
        "--k", str(k),
        "--dataset_path", config["path"],
        "--test_column", config["test_column"],
        "--format", config["format"],
        "--visible_gpus", visible_gpus,
        "--prompt_column", config["prompt_column"],
        "--tensor_parallel_size", str(len(gpu_ids))
    ]

    try:
        subprocess.run(cmd, check=True)
    except OSError as e:
        # Raised when the command cannot be started at all, e.g. when the
        # "poetry" executable is not on PATH.
        print(f"Error: could not launch evaluation command: {e}", file=sys.stderr)
        sys.exit(1)
    except subprocess.CalledProcessError as e:
        print(f"Error running evaluation: {e}", file=sys.stderr)
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nEvaluation interrupted by user", file=sys.stderr)
        sys.exit(1)

def main() -> None:
    """Script entry point: read the CLI options and start the evaluation."""
    cli = parse_args()
    # Arguments are passed in run_evaluation's declared order:
    # model_path, dataset, gpus, k.
    run_evaluation(cli.model, cli.dataset, cli.gpus, cli.k)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
