import argparse
import json
import os
import sys


# Maps each accumulator key to the field read from one rubric entry.
_RUBRIC_FIELDS = (
    ('F', 'F_score'),
    ('O', 'O_score'),
    ('P', 'P_score'),
    ('recall', 'recall'),
    ('precision', 'precision'),
)


def _warn(message):
    """Report a skipped input on stderr so stdout carries only the results."""
    print(f"warning: {message}", file=sys.stderr)


def _is_number(value):
    """Return True for int/float/bool, the only values that can be summed."""
    return isinstance(value, (int, float))


def _accumulate_dataset(dataset, totals):
    """Add every usable entry of one parsed result file to *totals*.

    Args:
        dataset: Parsed JSON object of a single result file.
        totals: Accumulator dict, updated in place.

    Raises:
        ValueError: If the rubric results are not a list or the benchmark
            dataset has no length, i.e. the file cannot be aggregated.
    """
    rubric_results = dataset.get("logical_nexus_results") or dataset.get("rubric_results")
    if rubric_results is None:
        return
    if not isinstance(rubric_results, list):
        raise ValueError("rubric results are not a list")

    # A null "benchmark_dataset" is treated like a missing one (no entries).
    benchmark_dataset = dataset.get("benchmark_dataset") or []
    try:
        entry_count = min(len(benchmark_dataset), len(rubric_results))
    except TypeError:
        raise ValueError("benchmark_dataset has no length") from None

    judge_results = dataset.get("judge_results")
    if not isinstance(judge_results, list):
        judge_results = []

    for index, rubric_entry in enumerate(rubric_results[:entry_count]):
        # The answer score and the rubric metrics are validated independently,
        # so one malformed half does not discard the other.
        answer = judge_results[index] if index < len(judge_results) else None
        if isinstance(answer, dict):
            correct = answer.get('correct', 0)
            if _is_number(correct):
                totals['score'] += correct
                totals['num_valid_score'] += 1

        if not isinstance(rubric_entry, dict):
            continue
        # Read and validate all fields before touching the totals, so an
        # invalid entry can never be partially accumulated.
        values = {
            total_key: rubric_entry.get(field, 0.0)
            for total_key, field in _RUBRIC_FIELDS
        }
        if not all(_is_number(value) for value in values.values()):
            continue
        for total_key, value in values.items():
            totals[total_key] += value
        totals['num_valid_rubric'] += 1


def _finalize(totals):
    """Turn the accumulated sums in *totals* into the reported averages."""
    if totals['num_valid_score'] > 0:
        totals['score'] /= totals['num_valid_score']

    rubric_count = totals['num_valid_rubric']
    if rubric_count > 0:
        for key in ('O', 'P', 'recall', 'precision'):
            totals[key] /= rubric_count

    # F is the harmonic mean of the averaged precision and recall, not the
    # mean of the per-entry F_score values; it is 0 when both averages are 0.
    denominator = totals['precision'] + totals['recall']
    if denominator > 0:
        totals['F'] = 2 * totals['precision'] * totals['recall'] / denominator
    else:
        totals['F'] = 0.0


def aggregate_results(input_path):
    """Aggregate all ``*.json`` result files found directly in *input_path*.

    Unreadable, unparsable or structurally unusable files are skipped with a
    warning on stderr. A missing directory also only produces a warning.

    Args:
        input_path: Directory holding the result files of one model.

    Returns:
        A dict with the averaged ``score``, ``recall``, ``precision``, ``O``
        and ``P`` values, the derived ``F`` value, and the entry counts
        ``num_valid_score`` and ``num_valid_rubric``.
    """
    totals = {
        'score': 0.0,
        'recall': 0.0,
        'precision': 0.0,
        'F': 0.0,
        'O': 0.0,
        'P': 0.0,
        'num_valid_score': 0,
        'num_valid_rubric': 0,
    }

    if not os.path.isdir(input_path):
        _warn(f"results directory not found: {input_path}")
    else:
        # Sorted so the floating-point summation order is reproducible.
        for name in sorted(os.listdir(input_path)):
            if not name.endswith('.json'):
                continue
            file_path = os.path.join(input_path, name)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    dataset = json.load(f)
            except (OSError, ValueError) as exc:
                # ValueError covers both invalid JSON and invalid UTF-8.
                _warn(f"skipping {file_path}: {exc}")
                continue

            if not isinstance(dataset, dict):
                _warn(f"skipping {file_path}: top-level JSON value is not an object")
                continue
            try:
                _accumulate_dataset(dataset, totals)
            except ValueError as exc:
                _warn(f"skipping {file_path}: {exc}")

    _finalize(totals)
    return totals


def main(argv=None):
    """Parse the command line, aggregate the results and print the summary.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Aggregate benchmarking results (overall only)')
    parser.add_argument('--model_id', type=str, help='Model ID', default=None)
    args = parser.parse_args(argv)

    model_id = args.model_id or "o4-mini"
    input_path = os.path.join(os.path.dirname(__file__), "results", model_id)

    all_scores = aggregate_results(input_path)

    print("\n===== Overall Results =====")
    print(f"recall: {100*all_scores['recall']:.2f}")
    print(f"precision: {100*all_scores['precision']:.2f}")
    print(f"F: {100*all_scores['F']:.2f}")
    print(f"O: {100*all_scores['O']:.2f}")
    print(f"P: {100*all_scores['P']:.2f}")
    print(f"score: {100*all_scores['score']:.2f}")


if __name__ == "__main__":
    main()


