import argparse
import json
import sys

import pandas as pd


def _load_annotations(path):
    """Parse a JSONL file into a list of records, ignoring blank lines."""
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def _reasoning_accuracy(annotations):
    """Return the fraction of annotations whose 'correct' flag is truthy.

    Returns NaN when there are no annotations: the ratio is undefined, and
    NaN is skipped by pandas' ``mean`` so the record still contributes to
    the correctness and support columns instead of crashing the report.
    """
    if not annotations:
        return float('nan')
    return sum(1 for step in annotations if step['correct']) / len(annotations)


def _summarize(records):
    """Aggregate per-question records into a per-model summary DataFrame.

    Each record must provide 'src_file', 'qid', 'annotations' and
    'model_answer_correctness'. The result has the columns Model,
    Reasoning_Accu, Correctness (both percentages rendered as 2-decimal
    strings) and Support (number of questions), sorted by ascending
    reasoning accuracy.
    """
    rows = [
        [
            # Model name = text after the last '_' and before the first '.'.
            # NOTE(review): a model name containing '.' (e.g. "x-3.5") is cut
            # at the first dot -- confirm the file-naming convention.
            record['src_file'].split('_')[-1].split('.')[0],
            record['qid'],
            _reasoning_accuracy(record['annotations']),
            record['model_answer_correctness'],
        ]
        for record in records
    ]
    df = pd.DataFrame(rows, columns=['Model', 'Question ID', 'Reasoning Accu', 'Correctness'])

    summary = df.groupby('Model').agg(
        Reasoning_Accu=('Reasoning Accu', 'mean'),
        Correctness=('Correctness', 'mean'),
        Support=('Question ID', 'count'),
    ).reset_index()
    summary = summary.sort_values(by='Reasoning_Accu', ascending=True)

    # Sort first (numerically), then render the ratios as percentages.
    for column in ('Reasoning_Accu', 'Correctness'):
        summary[column] = summary[column].apply(lambda x: f"{x * 100:.2f}")
    return summary


def main(argv=None):
    """Print a per-model summary table of human annotation results.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). Defaults to ``None``, in which case ``sys.argv[1:]`` is
            used, so calling ``main()`` behaves as before.

    The ``--result`` argument names a JSONL file with one annotated question
    per line. The table is written to stdout as Markdown; if the optional
    ``tabulate`` package needed by ``DataFrame.to_markdown`` is missing, a
    plain-text table is printed instead and a warning goes to stderr.
    """
    parser = argparse.ArgumentParser(description="Generate report from human annotation results.")
    parser.add_argument("--result", type=str, required=True, help='Path to consolidated human annotation jsonl')
    args = parser.parse_args(argv)

    summary = _summarize(_load_annotations(args.result))

    try:
        # tabulate re-parses numeric-looking strings and would print "50.00"
        # as "50" with its default float format; floatfmt keeps 2 decimals.
        table = summary.to_markdown(index=False, floatfmt=".2f")
    except ImportError:
        print("warning: 'tabulate' is not installed; printing a plain-text table instead of Markdown.",
              file=sys.stderr)
        table = summary.to_string(index=False)
    print(table)


# Run the report only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
