import argparse
import json
import logging
import os
import subprocess
import sys
from pathlib import Path

from transformers import set_seed
from utils import find_free_port

# Configure root logging when the module is loaded: every record shows timestamp, level, logger name, and message.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)


def run_lm_eval(model: str, task: str, output_path: Path, *, device: str = "cuda", batch_size: str = "32"):
    """
    Run lm-evaluation-harness (``lm_eval``) for one model/task pair and stream its output.

    The harness is started as a child process whose stdout and stderr are merged and
    echoed line by line as they arrive. This function does not check whether results
    already exist; the caller decides whether a run should be skipped.

    Args:
        model: Passed to the harness as ``--model_args pretrained=<model>``.
        task: Passed to ``--tasks``.
        output_path: Passed to ``--output_path``; its parent directory is created if missing.
        device: Passed to ``--device``. Defaults to ``"cuda"``.
        batch_size: Passed to ``--batch_size`` after conversion with ``str``. Defaults to ``"32"``.

    Raises:
        subprocess.CalledProcessError: If ``lm_eval`` exits with a non-zero status.
        Exception: Any other failure (for example the executable cannot be started)
            is logged and re-raised unchanged.
    """
    logger.info(f"\n ########## Running evaluation for model '{model}' ##########\n")
    output_path.parent.mkdir(parents=True, exist_ok=True)

    cmd = [
        "lm_eval",
        "--model", "hf",
        "--model_args", f"pretrained={model}",
        "--tasks", task,
        "--device", device,
        "--batch_size", str(batch_size),
        "--output_path", str(output_path),
    ]

    logger.info(f"Running command: {' '.join(cmd)}")

    try:
        # The child inherits this process's environment, so it sees the port chosen here.
        # NOTE(review): presumably a fresh port keeps concurrent runs from colliding - confirm.
        os.environ["MASTER_PORT"] = str(find_free_port())

        # Line-buffered text pipe with stderr folded into stdout so output can be echoed in real time.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1) as process:
            for line in process.stdout:
                # Print in real time; the line already ends with its own newline.
                print(line, end="", flush=True)

        # Leaving the ``with`` block waits for the child, so ``returncode`` is populated here.
        if process.returncode != 0:
            raise subprocess.CalledProcessError(process.returncode, cmd)

        logger.info(f"Results saved to {output_path}")

    except subprocess.CalledProcessError as e:
        logger.error(f"Error during evaluation: Command failed with exit code {e.returncode}")
        raise
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        raise


def parse_args():
    """
    Parse the command-line options for the evaluation script.

    All four options are optional and have no explicit default, so an option
    that is not supplied comes back as ``None``.

    Returns:
        argparse.Namespace with ``model_path`` and ``tasks`` (lists of strings),
        ``seed`` (int) and ``out_dir`` (str).
    """
    cli = argparse.ArgumentParser(description="Run evaluation on model and tasks.")

    # (flag, keyword arguments) pairs, registered in the order shown in --help.
    option_specs = (
        ("--model_path", {"type": str, "nargs": "+", "help": "Path to the model directory."}),
        ("--tasks", {"type": str, "nargs": "+", "help": "Task to evaluate."}),
        ("--seed", {"type": int, "help": "Random seed."}),
        ("--out_dir", {"type": str, "help": "Output directory to save evaluation results."}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()


def _as_list(value):
    """Wrap a lone string in a list so it is not iterated character by character."""
    return [value] if isinstance(value, str) else list(value)


def _model_output_subdir(model: str) -> Path:
    """Return a relative directory for *model* that stays inside the output root when joined to it."""
    model_path = Path(model)
    # Joining an absolute path onto another path discards the left-hand side, and ".."
    # segments climb out of it, so drop the anchor ("/" or a drive) and every "..".
    safe_parts = [part for part in model_path.parts if part not in (model_path.anchor, "..")]
    return Path(*safe_parts)


def main():
    """
    Evaluate every configured model on every configured task.

    Settings are read from ``./configs/eval_performance.json``; any command-line
    option that was actually supplied overrides the value from the file. Results
    for each pair go to ``<out_dir>/<model>/<task>``; a pair whose output path
    already exists is skipped.
    """
    cli_args = parse_args()

    # Load config from json file
    with open("./configs/eval_performance.json", "r", encoding="utf-8") as f:
        config = json.load(f)

    # Command-line values win over the file, but only when they were given (not None).
    config.update({name: value for name, value in vars(cli_args).items() if value is not None})
    args = argparse.Namespace(**config)

    logger.info(f"Arguments: {args}")

    # NOTE(review): this seeds the current process only; the lm_eval child process
    # started by run_lm_eval is a separate process - confirm whether it needs its own seed.
    set_seed(args.seed)

    output_root = Path(args.out_dir).resolve()

    # Evaluate models
    for model in _as_list(args.model_path):
        model_dir = output_root / _model_output_subdir(model)
        for task in _as_list(args.tasks):
            output_path = model_dir / task

            # Skip combinations that already have an output path.
            if output_path.exists():
                logger.info(f"Directory {output_path} already exists. Skipping this task.")
                continue

            run_lm_eval(model, task, output_path)

    logger.info("Evaluation complete")


# Run the evaluation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
