from benchmark.api import Evaluator
from pipeline.utils import print_colored
from benchmark.api import BenchmarkManager
# Constructed at import time. Nothing in the visible part of this file reads
# `manager`. NOTE(review): presumably kept for constructor side effects or for
# interactive use -- confirm before removing.
manager = BenchmarkManager()

  
def parse_args(argv=None):
    """Parse the command-line options for an evaluation run.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` carrying ``test_model_id``,
        ``eval_model_id``, ``stamp``, ``test_stamp`` and ``label_path``.
    """
    from argparse import ArgumentParser

    # The module has no docstring, so ``__doc__`` alone would leave ``--help``
    # without a description; fall back to an explicit one.
    arg_parser = ArgumentParser(
        description=__doc__
        or "Run benchmark.api.Evaluator.run_evaluation for a test model and an evaluation model.",
    )
    arg_parser.add_argument(
        "--test_model_id",
        type=str,
        help="The model id of the test model",
        # Other ids used so far: gpt-4o-mini, "webpage-chatGPT-4-hard",
        # "gpt-4-0125-preview", "webpage-GPT-4o-hard",
        # "webpage-chatGPT-4-Knowledge-hard"
        default="gpt-3.5-turbo",
    )
    arg_parser.add_argument(
        "--eval_model_id",
        type=str,
        help="The model id of the evaluation model",
        # Other ids used so far: "gpt-3.5-turbo", "Meta-Llama-3-70B-Instruct",
        # "gpt-4o"
        default="gpt-4-0125-preview",
    )
    arg_parser.add_argument(
        "--stamp",
        type=str,
        help="The stamp to distinguish your run",
        default="run_2",
    )
    arg_parser.add_argument(
        "--test_stamp",
        type=str,
        help="The stamp of the run of the test model",
        default=None,
    )
    arg_parser.add_argument(
        "--label_path",
        type=str,
        help="The path to the label file. If not provided, then we won't compare the results with the labels",
        default=None,
    )
    return arg_parser.parse_args(argv)


def main(argv=None):
    """Parse the CLI options and run the evaluation.

    Args:
        argv: Argument strings forwarded to ``parse_args``; ``None`` means
            the process command line.
    """
    args = parse_args(argv)
    print_colored(f"Test model: {args.test_model_id} | Eval model: {args.eval_model_id}", "red")
    evaluator = Evaluator(test_model_id=args.test_model_id, eval_model_id=args.eval_model_id)
    # The stamp distinguishes runs. NOTE(review): the earlier comment here said
    # reusing a stamp could continue the previous run and that run_evaluation
    # defaults the stamp to the current time; this CLI always passes a value
    # ("run_2" unless --stamp is given), so that default is never used from
    # here -- confirm this is intended.
    evaluator.run_evaluation(
        label_path=args.label_path,
        stamp=args.stamp,
        test_stamp=args.test_stamp,
    )


if __name__ == "__main__":
    main()
    
    # python -m pipeline.run_evaluation --label_path="pipeline/gpt_3.5_turbo_labels_run.json" --test_stamp="run" --stamp="run_5"
    # python -m pipeline.run_evaluation --label_path="pipeline/label.json"