import json

# One row per pairwise comparison emitted for a record of a geo/his/pol file:
# (answer_a key, answer_b key, answer_a type, answer_b type,
#  suffix of the ROUGE/BLEU/GPT2/BLEURT/UNIEVAL/BERT/BART keys,
#  suffix of the autoJ/Critique/TIGER/ChatGPT/GPT-4o keys,
#  annotation key holding the human preference)
_HUMAN_SUBJECT_COMPARISONS = (
    ('refined_student_answer_a', 'refined_student_answer_b', 'human', 'human',
     'human_vs_human', 'human_a_vs_human_b',
     'student_answer_a vs student_answer_b_new'),
    ('refined_student_answer_a', 'model_answer_a', 'human', 'model',
     'human_a_vs_model_a', 'human_a_vs_model_a',
     'stud_answer_a vs model_answer_a_new'),
    # NOTE(review): this comparison shows student answer A but reads the
    # ROUGE..BART values from the '*_human_b_vs_model_b' keys, while the
    # remaining metrics use 'human_a_vs_model_b'. Kept exactly as before;
    # confirm against the dataset schema whether this is intentional.
    ('refined_student_answer_a', 'model_answer_b', 'human', 'model',
     'human_b_vs_model_b', 'human_a_vs_model_b',
     'stud_answer_a vs model_answer_b_new'),
    ('model_answer_a', 'model_answer_b', 'model', 'model',
     'model_vs_model', 'model_a_vs_model_b',
     'model_answer_a vs model_answer_b_new'),
)

# The single comparison emitted for every other input file. Same layout as
# above; note the annotation key has no '_new' suffix here.
_MODEL_ONLY_COMPARISON = (
    'model_answer_a', 'model_answer_b', 'model', 'model',
    'model_vs_model', 'model_a_vs_model_b',
    'model_answer_a vs model_answer_b',
)


def _build_comparison_record(record_id, data, answer_a_key, answer_b_key,
                             answer_a_type, answer_b_type,
                             metric_suffix, evaluator_suffix, human_preference):
    """Assemble one flattened pairwise-comparison entry from a raw record.

    The key order of the returned dict is the order written to the output
    file, so it must not be changed casually.

    Raises:
        KeyError: if ``data`` lacks any of the keys looked up below.
    """
    return {
        "id": record_id,
        "context": data['context'],
        "question": data['Question'],
        "reference": data['Reference'],
        "key_information": data['Concise_Reference'],
        "student_answer_a": data[answer_a_key],
        "student_answer_b": data[answer_b_key],
        "answer_a_type": answer_a_type,
        "answer_b_type": answer_b_type,
        "ROUGE_preference": data[f'ROUGE_score_{metric_suffix}'],
        "BLEU_preference": data[f'BLEU_score_{metric_suffix}'],
        # GPT2 keys carry no '_score' infix, unlike their neighbours.
        "GPT2_preference": data[f'GPT2_{metric_suffix}'],
        "BLEURT_preference": data[f'BLEURT_score_{metric_suffix}'],
        "UNIEVAL_preference": data[f'UNIEVAL_score_{metric_suffix}'],
        "BERT_preference": data[f'BERT_score_{metric_suffix}'],
        "BART_preference": data[f'BART_score_{metric_suffix}'],
        "AutoJ_preference": data[f'autoJ_{evaluator_suffix}'],
        "Critique_preference": data[f'Critique_{evaluator_suffix}'],
        "Tiger_preference": data[f'TIGER_{evaluator_suffix}'],
        "ChatGPT_preference": data[f'ChatGPT_{evaluator_suffix}'],
        "GPT-4o_preference": data[f'GPT-4o_{evaluator_suffix}'],
        "Human_preference": human_preference,
    }


def get_final_dataset(input_path, annotation_path):
    """Flatten a metric-annotated dataset into one entry per answer pair.

    Reads the JSON list at ``input_path`` and the JSON list at
    ``annotation_path``, pairs them up positionally and writes the result to
    ``input_path`` with ``'.json'`` replaced by ``'_final.json'``.

    * If ``input_path`` contains ``'geo'``, ``'his'`` or ``'pol'`` (substring
      test on the whole path, directories included), every record yields four
      entries -- human/human, human/model A, human/model B and model/model --
      numbered consecutively from 0 across the whole file.
    * Otherwise every record yields a single model/model entry that keeps the
      record's own ``'id'``.

    Args:
        input_path: path of the dataset JSON file (a list of dicts).
        annotation_path: path of the human-annotation JSON file (a list of
            dicts in the same order as the dataset).

    Returns:
        None. The result is written to disk only.

    Raises:
        KeyError: if a record or annotation lacks an expected key.
    """
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(input_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    with open(annotation_path, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    info = []
    # zip() stops at the shorter list, so surplus records/annotations are
    # silently ignored.
    if 'geo' in input_path or 'his' in input_path or 'pol' in input_path:
        next_id = 0
        for data, annotation in zip(dataset, annotations):
            # Resolve all human labels first so a malformed annotation fails
            # before any entry of this record is appended.
            human_labels = [annotation[spec[-1]]
                            for spec in _HUMAN_SUBJECT_COMPARISONS]
            for spec, human_label in zip(_HUMAN_SUBJECT_COMPARISONS, human_labels):
                info.append(
                    _build_comparison_record(next_id, data, *spec[:-1], human_label)
                )
                next_id += 1
    else:
        for data, annotation in zip(dataset, annotations):
            human_label = annotation[_MODEL_ONLY_COMPARISON[-1]]
            info.append(
                _build_comparison_record(
                    data['id'], data, *_MODEL_ONLY_COMPARISON[:-1], human_label
                )
            )

    output_path = input_path.replace('.json', '_final.json')
    # json.dump escapes non-ASCII by default, so the file content is pure ASCII.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(info, f, indent=4)
    
# Module-level call: runs whenever this file is executed or imported. Reads the
# two JSON files named below and writes
# '../benchmark/english/hisen_llm_baseline_final.json' (the input path with
# '.json' replaced by '_final.json').
get_final_dataset('../benchmark/english/hisen_llm_baseline.json', '../benchmark/annotation/hisen.json')

def clear_output(input_path):
    """Collapse every metric preference label to ``'a'``, ``'b'`` or ``'tie'``.

    Loads the JSON list at ``input_path``, rewrites the
    ``<metric>_preference`` field of each entry for the twelve metrics listed
    below, and dumps the whole list to ``input_path`` with ``'.json'``
    replaced by ``'_cleared.json'``. All other fields, including
    ``Human_preference``, are written back unchanged.

    Which raw labels count as a win for which side depends on the entry's
    ``answer_a_type`` / ``answer_b_type``:

    * human vs human: ``'human_a'`` -> ``'a'``, ``'human_b'`` -> ``'b'``
    * model vs model: ``'model_a'`` -> ``'a'``, ``'model_b'`` -> ``'b'``
    * anything else:  ``'human_a'`` -> ``'a'``; ``'model_a'`` and
      ``'model_b'`` -> ``'b'``

    Every other value becomes ``'tie'``.

    Args:
        input_path: path of a JSON file holding a list of entry dicts.

    Raises:
        KeyError: if an entry lacks a type field or a preference field.
    """
    metric_names = ('ROUGE', 'BLEU', 'GPT2', 'BLEURT', 'UNIEVAL', 'BERT',
                    'BART', 'AutoJ', 'Critique', 'Tiger', 'ChatGPT', 'GPT-4o')

    # (labels that mean "answer a wins", labels that mean "answer b wins")
    human_pair_labels = (('human_a',), ('human_b',))
    model_pair_labels = (('model_a',), ('model_b',))
    # NOTE(review): 'human_b' is not recognised for mixed entries and so ends
    # up as 'tie'; confirm mixed entries never carry that label.
    mixed_pair_labels = (('human_a',), ('model_b', 'model_a'))

    with open(input_path, 'r') as source:
        entries = json.load(source)

    for entry in entries:
        a_type = entry['answer_a_type']
        if a_type == 'human' and entry['answer_b_type'] == 'human':
            a_wins, b_wins = human_pair_labels
        elif a_type == 'model' and entry['answer_b_type'] == 'model':
            a_wins, b_wins = model_pair_labels
        else:
            a_wins, b_wins = mixed_pair_labels

        for name in metric_names:
            field = f'{name}_preference'
            raw_label = entry[field]
            # Tuple membership compares with ==, so non-string values
            # (None, numbers, lists) simply fall through to 'tie'.
            if raw_label in a_wins:
                entry[field] = 'a'
            elif raw_label in b_wins:
                entry[field] = 'b'
            else:
                entry[field] = 'tie'

    # Write the normalised entries alongside the input file.
    cleared_path = input_path.replace('.json', '_cleared.json')
    with open(cleared_path, 'w') as sink:
        json.dump(entries, sink, indent=4)
        
# Module-level call: runs whenever this file is executed or imported. Reads the
# '_final.json' file produced by the get_final_dataset call earlier in this
# file and writes '../benchmark/english/hisen_llm_baseline_final_cleared.json'.
clear_output('../benchmark/english/hisen_llm_baseline_final.json')



