"""
Evaluate the agent based on a set of trajectories.

Usage: python scripts/evaluate.py -inp <input_trajectories_file> -ev <evaluator_type>
"""

import argparse
import random

from dotenv import load_dotenv
from toolemu.dataloader import DataLoader
from toolemu.evaluators import EVALUATORS
from toolemu.executors import FuncExecutor
from toolemu.utils import (
    llm_register_args,
    load_openai_llm_with_args,
    print_intermediate_result_and_stop,
)
from toolemu.utils.my_typing import *

# Load environment variables from a .env file, if one is present (python-dotenv).
load_dotenv()

# Stage 1: register only --eval-type, so the evaluator class can be chosen
# before the full set of command-line options is known.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--eval-type",
    "-ev",
    type=str,
    default="agent_safe",
    choices=EVALUATORS.keys(),
)
# parse_known_args() ignores the options that have not been registered yet;
# the leftover argv is discarded here and parsed again in stage 2.
# NOTE(review): because this first parse already handles -h/--help, the help
# output likely lists only --eval-type — confirm whether that is intended.
args_, _ = parser.parse_known_args()

# Stage 2: now that the evaluator type is known, let each component add its
# own options to the same parser.
evaluator_class = EVALUATORS[args_.eval_type]
llm_register_args(parser, prefix="evaluator")
evaluator_class.register_args(parser)
DataLoader.register_args(parser)
# Script-specific executor defaults: no retries, batches of 5, timeout of 600.
FuncExecutor.register_args(
    parser, default_num_retries=0, default_batch_size=5, default_timeout=600
)
# Prefix of the result file; when omitted, main() falls back to the base path
# of the loaded trajectories.
parser.add_argument("--output-file-prefix", "-out", type=str, default=None)
# Optional suffix inserted into the result file name right after "_eval".
parser.add_argument("--eval-results-out-suffix", "-es", type=str, default="")
parser.add_argument("--random-seed", "-seed", type=int, default=42)

# Full parse with every option registered; `args` and `evaluator_class` are
# module-level names that main() reads.
args = parser.parse_args()
# Seed Python's `random` module so runs are repeatable for a given seed.
random.seed(args.random_seed)


def main():
    """Evaluate every loaded trajectory and write the results to a jsonl file.

    Reads the module-level ``args`` and ``evaluator_class``. Results go to
    ``<prefix>_eval<suffix>_<eval_type>.jsonl``, where ``<prefix>`` is
    ``--output-file-prefix`` or, when that is not given, the ``base_path`` of
    the loaded trajectories.

    Raises:
        ValueError: If ``args.critique_rounds`` is greater than zero, which
            this script does not support yet.
    """
    # Fail fast on an unsupported configuration, before any trajectory or
    # model is loaded.
    if args.critique_rounds > 0:
        raise ValueError("Evaluation does not support critique rounds yet.")

    trajs = DataLoader.from_args(args, return_mode="with_idx", item_name="trajectory")
    output_file_prefix = args.output_file_prefix or trajs.base_path
    output_path = f"{output_file_prefix}_eval{args.eval_results_out_suffix}_{args.eval_type}.jsonl"

    evaluator_llm = load_openai_llm_with_args(args, prefix="evaluator")
    evaluator = evaluator_class.from_args(args, evaluator_llm)

    # Debug mode: run the evaluator on the first trajectory only and show the
    # intermediate result. The helper is presumably expected to end the run
    # (per its name) — verify against toolemu.utils.
    if evaluator._stop_at in ["preprocess", "prompt"]:
        result = evaluator({"trajectory": trajs[0]["item"]})
        print_intermediate_result_and_stop([result], evaluator._stop_at)

    def evaluate_trajec(traj_with_idx):
        """Evaluate one indexed trajectory.

        Args:
            traj_with_idx: Mapping with the trajectory under ``"item"`` and
                its index under ``"idx"``.

        Returns:
            A ``(failed_item, result)`` pair. On success ``failed_item`` is
            ``None`` and ``result`` is the first evaluation result. On failure
            ``failed_item`` is the input item and ``result`` holds the error
            message. In both cases ``result["eval_id"]`` is the trajectory
            index, so every record can be traced back to its trajectory.
        """
        traj_idx, traj = traj_with_idx["idx"], traj_with_idx["item"]
        try:
            results = evaluator({"trajectory": traj})
            result = results[0]  # TODO: support merge multiple rounds of critiques
        except Exception as e:
            # Best-effort: one failing trajectory must not abort the whole
            # run. Tag the error record with the trajectory index; without it
            # a failure could not be attributed to any trajectory.
            return traj_with_idx, {"error": str(e), "eval_id": traj_idx}
        result["eval_id"] = traj_idx
        return None, result

    runner = FuncExecutor.from_args(args)
    runner.run(evaluate_trajec, trajs, output_path)
    print(
        "You may want to use scripts to convert the result jsonl file "
        f"{output_path} to json for easier reading."
    )


# Run the evaluation only when this file is executed as a script.
# NOTE: the argument parsing above still runs at import time.
if __name__ == "__main__":
    main()
