from typing import Counter
from bert_score import score
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import jieba
from rouge_chinese import Rouge
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.nn import functional as F
from BARTScore.bart_score import BARTScorer
from bleurt_pytorch import BleurtConfig, BleurtForSequenceClassification, BleurtTokenizer
from UniEval.utils import convert_to_json
from UniEval.metric.evaluator import get_evaluator
from transfer_openai import send_request, send_request_sb
from tqdm import tqdm

def bert_calculation():
    """Add BERTScore values to every record of the English benchmark files.

    Each file under ``../benchmark/english/`` is loaded, every answer is
    scored against the record's ``Reference`` text, and the file is rewritten
    in place with a ``BERT_score_<variant>`` entry per answer.  The first
    three files carry human (``refined_student_answer_*``) and model
    (``model_answer_*``) answers; the last three carry model answers only.

    The stored value is element ``[2]`` of the tuple returned by
    ``bert_score.score`` (the F1 tensor in bert_score's (P, R, F1) ordering).

    Files are opened as UTF-8 explicitly: the JSON is written with
    ``ensure_ascii=False``, so relying on the platform default encoding could
    fail or corrupt non-ASCII text.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    human_fields = {
        'human_a': 'refined_student_answer_a',
        'human_b': 'refined_student_answer_b',
    }
    model_fields = {
        'model_a': 'model_answer_a',
        'model_b': 'model_answer_b',
    }
    for index, file in enumerate(files):
        # Only the first three files contain human answers.
        fields = {**human_fields, **model_fields} if index < 3 else model_fields
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        refs = [data['Reference'] for data in dataset]
        hyps = {
            variant: [data[field] for data in dataset]
            for variant, field in fields.items()
        }
        f1_scores = {
            variant: score(answers, refs, lang='en', verbose=False)[2]
            for variant, answers in hyps.items()
        }
        for i, data in enumerate(dataset):
            for variant, f1 in f1_scores.items():
                data[f'BERT_score_{variant}'] = f1[i].item()

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)

def rouge_calculation():
    """Add ROUGE scores to every record of the English benchmark files.

    For each answer a ``ROUGE_score_<variant>`` entry is stored holding the
    dictionary returned by ``RougeScorer.score(reference, answer)``.  When
    written to JSON each per-metric score tuple becomes a list, which is how
    the downstream comparison reads it (``['rougeL'][2]``).

    The first three files carry human and model answers; the last three
    carry model answers only.

    Changes from the previous version:
    * the metric list named ``rougeL`` twice; the duplicate is replaced by
      ``rouge1`` so that ROUGE-1, ROUGE-2 and ROUGE-L are all recorded
      (existing ``rouge2`` / ``rougeL`` entries are unaffected);
    * the scorer is built once rather than once per record;
    * files are read and written as UTF-8 explicitly, since the output uses
      ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    human_fields = {
        'human_a': 'refined_student_answer_a',
        'human_b': 'refined_student_answer_b',
    }
    model_fields = {
        'model_a': 'model_answer_a',
        'model_b': 'model_answer_b',
    }
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    for index, file in enumerate(files):
        # Only the first three files contain human answers.
        fields = {**human_fields, **model_fields} if index < 3 else model_fields
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            ref = data['Reference']
            results = {
                variant: scorer.score(ref, data[field])
                for variant, field in fields.items()
            }
            for variant, result in results.items():
                data[f'ROUGE_score_{variant}'] = result

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
              
def bleu_calculation():
    """Add sentence-level BLEU scores to the English benchmark files.

    Reference and answers are tokenized by whitespace (``str.split``) and
    scored with ``nltk``'s ``sentence_bleu`` using ``SmoothingFunction``
    method 4.  Each record gains a ``BLEU_score_<variant>`` entry and the
    file is rewritten in place.

    The first three files carry human and model answers; the last three
    carry model answers only.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    human_fields = {
        'human_a': 'refined_student_answer_a',
        'human_b': 'refined_student_answer_b',
    }
    model_fields = {
        'model_a': 'model_answer_a',
        'model_b': 'model_answer_b',
    }
    # The smoothing function is the same for every call, so build it once.
    smooth = SmoothingFunction().method4
    for index, file in enumerate(files):
        # Only the first three files contain human answers.
        fields = {**human_fields, **model_fields} if index < 3 else model_fields
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            # sentence_bleu expects a list of tokenized references.
            references = [data['Reference'].split()]
            bleu = {
                variant: sentence_bleu(references, data[field].split(), smoothing_function=smooth)
                for variant, field in fields.items()
            }
            for variant, value in bleu.items():
                data[f'BLEU_score_{variant}'] = value

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
            
def bleurt_calculation():
    """Add BLEURT scores to every record of the English benchmark files.

    Loads the ``lucadiliello/BLEURT-20`` model and tokenizer once, then for
    each file scores every answer against the record's ``Reference`` and
    stores the flattened logits as ``BLEURT_score_<variant>``.  The file is
    rewritten in place.

    The first three files carry human and model answers; the last three
    carry model answers only.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.

    NOTE(review): each file is tokenized and scored as a single batch with
    no truncation, exactly as before.  Large files or very long answers may
    exhaust memory or exceed the model's input limit - confirm against the
    data before relying on this.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    human_fields = {
        'human_a': 'refined_student_answer_a',
        'human_b': 'refined_student_answer_b',
    }
    model_fields = {
        'model_a': 'model_answer_a',
        'model_b': 'model_answer_b',
    }
    model = BleurtForSequenceClassification.from_pretrained('lucadiliello/BLEURT-20')
    tokenizer = BleurtTokenizer.from_pretrained('lucadiliello/BLEURT-20')
    model.eval()
    for index, file in enumerate(files):
        # Only the first three files contain human answers.
        fields = {**human_fields, **model_fields} if index < 3 else model_fields
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        refs = [data['Reference'] for data in dataset]
        hyps = {
            variant: [data[field] for data in dataset]
            for variant, field in fields.items()
        }
        scores = {}
        with torch.no_grad():
            for variant, answers in hyps.items():
                inputs = tokenizer(refs, answers, padding='longest', return_tensors='pt')
                scores[variant] = model(**inputs).logits.flatten().tolist()

        for i, data in enumerate(dataset):
            for variant, values in scores.items():
                data[f'BLEURT_score_{variant}'] = values[i]

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)

def bart_calculation():
    """Add BARTScore values to every record of the English benchmark files.

    A default-constructed ``BARTScorer`` scores each list of answers against
    the list of ``Reference`` texts via ``model.score(answers, refs)``; the
    i-th result is stored on the i-th record as ``BART_score_<variant>`` and
    the file is rewritten in place.

    The first three files carry human and model answers; the last three
    carry model answers only.

    Changes from the previous version: the leftover ``print("Here")`` debug
    output is gone, and files are read and written as UTF-8 explicitly
    because the output is serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    human_fields = {
        'human_a': 'refined_student_answer_a',
        'human_b': 'refined_student_answer_b',
    }
    model_fields = {
        'model_a': 'model_answer_a',
        'model_b': 'model_answer_b',
    }
    model = BARTScorer()
    for index, file in enumerate(files):
        # Only the first three files contain human answers.
        fields = {**human_fields, **model_fields} if index < 3 else model_fields
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        refs = [data['Reference'] for data in dataset]
        hyps = {
            variant: [data[field] for data in dataset]
            for variant, field in fields.items()
        }
        scores = {
            variant: model.score(answers, refs)
            for variant, answers in hyps.items()
        }
        for i, data in enumerate(dataset):
            for variant, values in scores.items():
                data[f'BART_score_{variant}'] = values[i]

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)

def calculate_chinese_rouge():
    """Print how often ROUGE-1 F1 agrees with the recorded preference.

    Reads ``../dataset/polich.jsonl`` (one JSON object per line), segments
    ``Reference``, ``Student_Answer_a`` and ``Student_Answer_b`` with jieba,
    and compares the ROUGE-1 F-score of the two answers against the
    reference.  A record counts as correct when the higher-scoring answer
    matches ``Preference`` (``'a'`` or ``'b'``).  Records where both answers
    score the same are left out of the accuracy.  Nothing is printed when no
    record could be judged.

    Fixes relative to the previous version:
    * ``rouge_chinese`` returns each metric as a dict keyed by ``'r'``,
      ``'p'`` and ``'f'``, so the former ``['rouge-1'][2]`` lookup raised
      ``KeyError``; the F-score is now read with ``['f']``;
    * ``get_scores`` takes ``(hypothesis, reference)``; the arguments were
      passed the other way round;
    * the ``Rouge`` object is created once instead of per record;
    * blank lines in the JSONL file are skipped instead of failing to parse.
    """
    with open('../dataset/polich.jsonl', 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f if line.strip()]

    rouge = Rouge()
    correct = []
    for data in dataset:
        preference = data.get('Preference')

        # rouge_chinese expects whitespace-separated tokens.
        ref_tokens = ' '.join(jieba.cut(data.get('Reference')))
        hyp_a_tokens = ' '.join(jieba.cut(data.get('Student_Answer_a')))
        hyp_b_tokens = ' '.join(jieba.cut(data.get('Student_Answer_b')))

        score_a = rouge.get_scores(hyp_a_tokens, ref_tokens)[0]['rouge-1']['f']
        score_b = rouge.get_scores(hyp_b_tokens, ref_tokens)[0]['rouge-1']['f']

        if score_a == score_b:
            # A tie gives no prediction, so it does not count either way.
            continue
        predicted = 'a' if score_a > score_b else 'b'
        correct.append(1 if preference == predicted else 0)

    if correct:
        accuracy = sum(correct) / len(correct) * 100
        print(f"Accuracy: {accuracy:.2f}%")

def calculate_perplexity(tokens_tensor, model):
    """Return ``exp(loss)`` for ``model`` evaluated on ``tokens_tensor``.

    The model is called with the tokens as both input and ``labels`` under
    ``torch.no_grad()``; the ``loss`` attribute of its output is
    exponentiated and returned as a tensor.
    """
    with torch.no_grad():
        mean_loss = model(tokens_tensor, labels=tokens_tensor).loss
    return torch.exp(mean_loss)

def gpt2_ppl():
    """Add GPT-2 perplexities to every record of the English benchmark files.

    Each answer is encoded with the ``gpt2`` tokenizer (``truncation=True``)
    and passed to ``calculate_perplexity``; the resulting value is stored as
    ``GPT2_ppl_<variant>`` and the file is rewritten in place.

    The first three files carry human and model answers; the last three
    carry model answers only.

    Changes from the previous version: the model and tokenizer are loaded
    once instead of once per file, the unused ``Reference`` lookup is
    dropped, and files are read and written as UTF-8 explicitly because the
    output is serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    human_fields = {
        'human_a': 'refined_student_answer_a',
        'human_b': 'refined_student_answer_b',
    }
    model_fields = {
        'model_a': 'model_answer_a',
        'model_b': 'model_answer_b',
    }
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    for index, file in enumerate(files):
        # Only the first three files contain human answers.
        fields = {**human_fields, **model_fields} if index < 3 else model_fields
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            perplexities = {}
            for variant, field in fields.items():
                tokens = tokenizer.encode(data[field], return_tensors='pt', truncation=True)
                perplexities[variant] = calculate_perplexity(tokens, model).item()
            for variant, value in perplexities.items():
                data[f'GPT2_ppl_{variant}'] = value

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
            
def generate_gpt2_compare():
    """Label pairwise winners by GPT-2 perplexity in the benchmark files.

    For each compared pair ``(x, y)`` the record gains
    ``GPT2_<x>_vs_<y>``, set to the name of the answer with the *lower*
    ``GPT2_ppl_*`` value, or ``'tie'`` when neither is strictly lower.

    The first three files get four comparisons (human_a/human_b,
    human_a/model_a, human_a/model_b, model_a/model_b); the last three only
    get model_a/model_b.  Files are rewritten in place.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    all_pairs = [
        ('human_a', 'human_b'),
        ('human_a', 'model_a'),
        ('human_a', 'model_b'),
        ('model_a', 'model_b'),
    ]
    for index, file in enumerate(files):
        # Files after the third have no human answers, so only the model pair applies.
        pairs = all_pairs if index < 3 else all_pairs[-1:]
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            for first, second in pairs:
                ppl_first = data[f'GPT2_ppl_{first}']
                ppl_second = data[f'GPT2_ppl_{second}']
                if ppl_first < ppl_second:
                    winner = first
                elif ppl_first > ppl_second:
                    winner = second
                else:
                    winner = 'tie'
                data[f'GPT2_{first}_vs_{second}'] = winner

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
            

def generate_tiger_compare():
    """Label pairwise winners by TIGER score in the benchmark files.

    For each compared pair ``(x, y)`` the record gains
    ``TIGER_<x>_vs_<y>``, set to the name of the answer with the *higher*
    ``TIGER_score_*`` value, or ``'tie'`` when neither is strictly higher.
    A falsy stored score (for example ``null``) is treated as 0; the key
    itself must be present.

    The first three files get four comparisons (human_a/human_b,
    human_a/model_a, human_a/model_b, model_a/model_b); the last three only
    get model_a/model_b.  Files are rewritten in place.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    all_pairs = [
        ('human_a', 'human_b'),
        ('human_a', 'model_a'),
        ('human_a', 'model_b'),
        ('model_a', 'model_b'),
    ]
    for index, file in enumerate(files):
        # Files after the third have no human answers, so only the model pair applies.
        pairs = all_pairs if index < 3 else all_pairs[-1:]
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            for first, second in pairs:
                # Falsy scores fall back to 0 so they can still be compared.
                score_first = data[f'TIGER_score_{first}'] or 0
                score_second = data[f'TIGER_score_{second}'] or 0
                if score_first > score_second:
                    winner = first
                elif score_first < score_second:
                    winner = second
                else:
                    winner = 'tie'
                data[f'TIGER_{first}_vs_{second}'] = winner

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)

def generate_bert_score_compare():
    """Label pairwise winners by BERTScore in the benchmark files.

    For each compared pair ``(x, y)`` the record gains
    ``BERT_<x>_vs_<y>``, set to the name of the answer with the *higher*
    ``BERT_score_*`` value, or ``'tie'`` when neither is strictly higher.

    The first three files get four comparisons (human_a/human_b,
    human_a/model_a, human_a/model_b, model_a/model_b); the last three only
    get model_a/model_b.  Files are rewritten in place.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    all_pairs = [
        ('human_a', 'human_b'),
        ('human_a', 'model_a'),
        ('human_a', 'model_b'),
        ('model_a', 'model_b'),
    ]
    for index, file in enumerate(files):
        # Files after the third have no human answers, so only the model pair applies.
        pairs = all_pairs if index < 3 else all_pairs[-1:]
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            for first, second in pairs:
                score_first = data[f'BERT_score_{first}']
                score_second = data[f'BERT_score_{second}']
                if score_first > score_second:
                    winner = first
                elif score_first < score_second:
                    winner = second
                else:
                    winner = 'tie'
                data[f'BERT_{first}_vs_{second}'] = winner

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
    
def generate_rouge_score_compare():
    """Label pairwise winners by ROUGE-L in the benchmark files.

    For each compared pair ``(x, y)`` the record gains
    ``ROUGE_<x>_vs_<y>``, set to the name of the answer whose stored
    ``ROUGE_score_*['rougeL'][2]`` value is *higher*, or ``'tie'`` when
    neither is strictly higher.  Index 2 is presumably the F-measure slot of
    the serialized rouge_score tuple - confirm against how the scores were
    written.

    The first three files get four comparisons (human_a/human_b,
    human_a/model_a, human_a/model_b, model_a/model_b); the last three only
    get model_a/model_b.  Files are rewritten in place.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    all_pairs = [
        ('human_a', 'human_b'),
        ('human_a', 'model_a'),
        ('human_a', 'model_b'),
        ('model_a', 'model_b'),
    ]
    for index, file in enumerate(files):
        # Files after the third have no human answers, so only the model pair applies.
        pairs = all_pairs if index < 3 else all_pairs[-1:]
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            for first, second in pairs:
                score_first = data[f'ROUGE_score_{first}']['rougeL'][2]
                score_second = data[f'ROUGE_score_{second}']['rougeL'][2]
                if score_first > score_second:
                    winner = first
                elif score_first < score_second:
                    winner = second
                else:
                    winner = 'tie'
                data[f'ROUGE_{first}_vs_{second}'] = winner

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
            
def generate_bleu_score_compare():
    """Label pairwise winners by BLEU score in the benchmark files.

    For each compared pair ``(x, y)`` the record gains
    ``BLEU_<x>_vs_<y>``, set to the name of the answer with the *higher*
    ``BLEU_score_*`` value, or ``'tie'`` when neither is strictly higher.

    The first three files get four comparisons (human_a/human_b,
    human_a/model_a, human_a/model_b, model_a/model_b); the last three only
    get model_a/model_b.  Files are rewritten in place.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    all_pairs = [
        ('human_a', 'human_b'),
        ('human_a', 'model_a'),
        ('human_a', 'model_b'),
        ('model_a', 'model_b'),
    ]
    for index, file in enumerate(files):
        # Files after the third have no human answers, so only the model pair applies.
        pairs = all_pairs if index < 3 else all_pairs[-1:]
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        for data in dataset:
            for first, second in pairs:
                score_first = data[f'BLEU_score_{first}']
                score_second = data[f'BLEU_score_{second}']
                if score_first > score_second:
                    winner = first
                elif score_first < score_second:
                    winner = second
                else:
                    winner = 'tie'
                data[f'BLEU_{first}_vs_{second}'] = winner

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
            
def get_unieval_result():
    """Add UniEval consistency scores to the English benchmark files.

    Builds the UniEval evaluator for the ``'fact'`` task once.  For each
    file, every list of answers is paired with the ``Reference`` texts (as
    ``src_list``) through ``convert_to_json`` and evaluated; the
    ``'consistency'`` value of the i-th result is stored on the i-th record
    as ``UNIEVAL_score_<variant>``.  Files are rewritten in place.

    The first three files carry human and model answers; the last three
    carry model answers only.

    Files are read and written as UTF-8 explicitly because the output is
    serialized with ``ensure_ascii=False``.
    """
    files = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']
    base = '../benchmark/english/'
    human_fields = {
        'human_a': 'refined_student_answer_a',
        'human_b': 'refined_student_answer_b',
    }
    model_fields = {
        'model_a': 'model_answer_a',
        'model_b': 'model_answer_b',
    }
    task = 'fact'
    evaluator = get_evaluator(task)
    for index, file in enumerate(files):
        # Only the first three files contain human answers.
        fields = {**human_fields, **model_fields} if index < 3 else model_fields
        with open(base + file, 'r', encoding='utf-8') as f:
            dataset = json.load(f)

        refs = [data['Reference'] for data in dataset]
        hyps = {
            variant: [data[field] for data in dataset]
            for variant, field in fields.items()
        }
        scores = {}
        for variant, answers in hyps.items():
            payload_in = convert_to_json(output_list=answers, src_list=refs)
            scores[variant] = evaluator.evaluate(payload_in)

        for i, data in enumerate(dataset):
            for variant, results in scores.items():
                data[f'UNIEVAL_score_{variant}'] = results[i]['consistency']

        # Serialize before reopening so a serialization error cannot truncate the file.
        payload = json.dumps(dataset, ensure_ascii=False)
        with open(base + file, 'w', encoding='utf-8') as f:
            f.write(payload)
            
def generate_bart_score_compare():
    """Write pairwise BART-score verdicts into the English benchmark files.

    Every JSON file under ``../benchmark/english/`` is loaded, annotated and
    written back to the same path.  Records in the first three files receive
    four verdicts (human_a vs human_b, human_a vs model_a, human_a vs model_b
    and model_a vs model_b); records in the remaining files only receive the
    model_a vs model_b verdict.  A verdict names the side whose
    ``BART_score_*`` value is strictly higher, or ``'tie'`` otherwise.
    """
    base = '../benchmark/english/'
    file_names = ['geoen.json', 'polen.json', 'hisen.json', 'meden.json', 'psyen.json', 'lawen.json']

    def verdict(left_score, right_score, left_name, right_name):
        # Strictly greater wins; anything that is neither > nor < is a tie.
        if left_score > right_score:
            return left_name
        if left_score < right_score:
            return right_name
        return 'tie'

    for index, file_name in enumerate(file_names):
        # Only the first three files carry the human answer scores.
        with_humans = index < 3
        path = base + file_name
        with open(path, 'r') as handle:
            records = json.loads(handle.read())
        for record in records:
            if with_humans:
                human_a = record['BART_score_human_a']
                human_b = record['BART_score_human_b']
            model_a = record['BART_score_model_a']
            model_b = record['BART_score_model_b']
            if with_humans:
                record['BART_human_a_vs_human_b'] = verdict(human_a, human_b, 'human_a', 'human_b')
                record['BART_human_a_vs_model_a'] = verdict(human_a, model_a, 'human_a', 'model_a')
                record['BART_human_a_vs_model_b'] = verdict(human_a, model_b, 'human_a', 'model_b')
            record['BART_model_a_vs_model_b'] = verdict(model_a, model_b, 'model_a', 'model_b')
        with open(path, 'w') as handle:
            handle.write(json.dumps(records, ensure_ascii=False))
                
def generate_bart_score_med_only():
    """Add the model_a vs model_b BART verdict to ``psyen_llm.json``.

    The file ``../benchmark/english/psyen_llm.json`` is read, each record
    gets ``BART_model_a_vs_model_b`` set to the side with the strictly
    higher BART score (``'tie'`` otherwise), and the file is rewritten.
    """
    path = '../benchmark/english/psyen_llm.json'
    with open(path, 'r') as source:
        records = json.loads(source.read())
    for record in records:
        score_a = record['BART_score_model_a']
        score_b = record['BART_score_model_b']
        # Default to a tie and override only on a strict inequality.
        label = 'tie'
        if score_a > score_b:
            label = 'model_a'
        elif score_a < score_b:
            label = 'model_b'
        record['BART_model_a_vs_model_b'] = label
    with open(path, 'w') as sink:
        sink.write(json.dumps(records, ensure_ascii=False))


def generate_bleurt_score_compare():
    """Write pairwise BLEURT-score verdicts into the English benchmark files.

    Files are read from and written back to ``../benchmark/english/``.  The
    first group of files gets all four matchups per record; the second group
    only gets model_a vs model_b.  For each matchup the side with the
    strictly higher ``BLEURT_score_*`` value is recorded, or ``'tie'``.
    """
    base = '../benchmark/english/'
    matchups = [
        ('human_a', 'human_b'),
        ('human_a', 'model_a'),
        ('human_a', 'model_b'),
        ('model_a', 'model_b'),
    ]
    plan = [
        (['geoen.json', 'polen.json', 'hisen.json'], matchups),
        (['meden.json', 'psyen.json', 'lawen.json'], matchups[3:]),
    ]
    for file_group, group_matchups in plan:
        # Distinct sides in first-seen order, so scores are looked up in a fixed order.
        sides = list(dict.fromkeys(side for pair in group_matchups for side in pair))
        for file_name in file_group:
            path = base + file_name
            with open(path, 'r') as handle:
                entries = json.loads(handle.read())
            for entry in entries:
                # Fetch every needed score before writing any verdict.
                scores = {side: entry['BLEURT_score_' + side] for side in sides}
                for first, second in group_matchups:
                    key = f'BLEURT_{first}_vs_{second}'
                    if scores[first] > scores[second]:
                        entry[key] = first
                    elif scores[first] < scores[second]:
                        entry[key] = second
                    else:
                        entry[key] = 'tie'
            with open(path, 'w') as handle:
                handle.write(json.dumps(entries, ensure_ascii=False))

def generate_unieval_score_compare():
    """Write pairwise UniEval-score verdicts into the English benchmark files.

    Each file under ``../benchmark/english/`` is rewritten in place.  The
    first three files get four verdicts per record (human_a vs human_b,
    human_a vs model_a, human_a vs model_b, model_a vs model_b); the last
    three only get model_a vs model_b.  The verdict is the side with the
    strictly higher ``UNIEVAL_score_*`` value, or ``'tie'``.
    """
    base = '../benchmark/english/'
    human_files = ('geoen.json', 'polen.json', 'hisen.json')
    model_files = ('meden.json', 'psyen.json', 'lawen.json')

    def annotate(path, sides, pairs):
        # Load one file, add a verdict per pair to every record, write it back.
        with open(path, 'r') as source:
            items = json.loads(source.read())
        for item in items:
            score = {}
            for side in sides:
                score[side] = item['UNIEVAL_score_' + side]
            for left, right in pairs:
                if score[left] > score[right]:
                    outcome = left
                elif score[left] < score[right]:
                    outcome = right
                else:
                    outcome = 'tie'
                item['UNIEVAL_' + left + '_vs_' + right] = outcome
        with open(path, 'w') as sink:
            sink.write(json.dumps(items, ensure_ascii=False))

    for name in human_files:
        annotate(
            base + name,
            ('human_a', 'human_b', 'model_a', 'model_b'),
            (('human_a', 'human_b'), ('human_a', 'model_a'), ('human_a', 'model_b'), ('model_a', 'model_b')),
        )
    for name in model_files:
        annotate(base + name, ('model_a', 'model_b'), (('model_a', 'model_b'),))
    
def convert_auto_j(input_path):
    """Translate numeric ``auto_J_*`` results into named ``autoJ_*`` verdicts.

    Args:
        input_path: JSON file to convert; it is rewritten in place.  When the
            path contains ``'geo'``, ``'pol'`` or ``'his'`` all four
            comparisons are converted, otherwise only model_a vs model_b.

    A source value of 0 maps to the first side of the comparison, 1 to the
    second side, and anything else to ``'tie'``.
    """
    with open(input_path, 'r') as f:
        dataset = json.loads(f.read())

    # (source suffix, target suffix, label for 0, label for 1)
    conversions = [('model_a_vs_model_b', 'model_a_vs_model_b', 'model_a', 'model_b')]
    if any(tag in input_path for tag in ('geo', 'pol', 'his')):
        conversions = [
            ('stud_a_vs_stud_b', 'human_a_vs_human_b', 'human_a', 'human_b'),
            ('stud_a_vs_model_a', 'human_a_vs_model_a', 'human_a', 'model_a'),
            ('stud_a_vs_model_b', 'human_a_vs_model_b', 'human_a', 'model_b'),
        ] + conversions

    for data in dataset:
        for source, target, first, second in conversions:
            raw = data['auto_J_' + source]
            data['autoJ_' + target] = first if raw == 0 else second if raw == 1 else 'tie'

    with open(input_path, 'w') as f:
        f.write(json.dumps(dataset, ensure_ascii=False, indent=4))

def convert_critique(input_path):
    """Translate numeric ``Critique_*`` results into named verdicts, in place.

    Args:
        input_path: JSON file to convert; it is rewritten in place.  When the
            path contains ``'geo'``, ``'pol'`` or ``'his'`` the three
            ``Critique_stud_a_vs_*`` fields are also converted into
            ``Critique_human_a_vs_*`` fields and the path is printed.

    A raw value of 0 maps to the first side of the comparison, 1 to the
    second side, and anything else to ``'tie'``.

    ``Critique_model_a_vs_model_b`` is both the source and the target field.
    A value that is already one of the final labels is therefore left alone,
    so running the conversion again on an already-converted file no longer
    collapses every verdict to ``'tie'``.
    """
    final_labels = ('model_a', 'model_b', 'tie')

    def to_label(raw, first, second):
        if raw == 0:
            return first
        if raw == 1:
            return second
        return 'tie'

    with open(input_path, 'r') as f:
        dataset = json.loads(f.read())

    has_human_answers = any(tag in input_path for tag in ('geo', 'pol', 'his'))
    if has_human_answers:
        print(input_path)

    for data in dataset:
        if has_human_answers:
            data['Critique_human_a_vs_human_b'] = to_label(
                data['Critique_stud_a_vs_stud_b'], 'human_a', 'human_b')
            data['Critique_human_a_vs_model_a'] = to_label(
                data['Critique_stud_a_vs_model_a'], 'human_a', 'model_a')
            data['Critique_human_a_vs_model_b'] = to_label(
                data['Critique_stud_a_vs_model_b'], 'human_a', 'model_b')
        current = data['Critique_model_a_vs_model_b']
        # Skip values that a previous run already converted (see docstring).
        if not (isinstance(current, str) and current in final_labels):
            data['Critique_model_a_vs_model_b'] = to_label(current, 'model_a', 'model_b')

    with open(input_path, 'w') as f:
        f.write(json.dumps(dataset, ensure_ascii=False, indent=4))
# NOTE(review): module-level call -- it runs whenever this file is executed or
# imported, and convert_critique rewrites the given JSON file in place.
convert_critique('../benchmark/english/hisen_llm.json')
        
def convert_critique_one_file():
    """Convert Critique results in ``hisen_llm_baseline.json`` to named verdicts.

    For every record of ``../benchmark/english/hisen_llm_baseline.json`` that
    has a ``Critique_model_a_vs_model_a`` field, ``Critique_model_a_vs_model_b``
    is set to ``'model_a'`` (value 0), ``'model_b'`` (value 1) or ``'tie'``
    (anything else).  Records without that field are left untouched.  The
    file is rewritten in place.

    The file is loaded once: source and target are the same file, so a second
    parse of identical content is unnecessary.
    """
    path = '../benchmark/english/hisen_llm_baseline.json'
    with open(path, 'r') as f:
        target = json.loads(f.read())
    for tar in target:
        # NOTE(review): the source key ends in "model_a_vs_model_a" while the
        # written key ends in "model_a_vs_model_b" -- confirm this is intended.
        if 'Critique_model_a_vs_model_a' not in tar:
            continue
        raw = tar['Critique_model_a_vs_model_a']
        if raw == 0:
            tar['Critique_model_a_vs_model_b'] = 'model_a'
        elif raw == 1:
            tar['Critique_model_a_vs_model_b'] = 'model_b'
        else:
            tar['Critique_model_a_vs_model_b'] = 'tie'
    with open(path, 'w') as f:
        f.write(json.dumps(target, ensure_ascii=False, indent=4))
        
# NOTE(review): module-level call -- it runs whenever this file is executed or
# imported, and rewrites ../benchmark/english/hisen_llm_baseline.json in place.
convert_critique_one_file()
def _agreement_rate(records, human_key, baseline_key):
    """Return the fraction of records whose two verdict fields are equal."""
    matches = sum(1 for record in records if record[human_key] == record[baseline_key])
    return matches / len(records)


def compare_human_with_baselines(input_path):
    """Print how often annotator 1 agrees with each automatic baseline.

    Args:
        input_path: JSON file holding a list of records.  Every record must
            contain ``decided_1_<comparison>`` and ``<baseline>_<comparison>``
            for each baseline prefix and each of the four comparisons
            (human_a vs human_b, human_a vs model_a, human_a vs model_b,
            model_a vs model_b).

    For each baseline a header line is printed, followed by three agreement
    rates: Human vs Human, Human vs Model (the mean of the human_a vs model_a
    and human_a vs model_b rates) and Model vs Model.

    Raises:
        KeyError: if a record lacks one of the required fields.  All rates
            are computed before anything is printed, so no partial report is
            emitted in that case.

    The ``decided_2_*`` fields are not read, because no annotator-2 agreement
    is reported here.  An empty dataset prints a short notice instead of
    dividing by zero.
    """
    # (label used in the printed report, key prefix used in the JSON records)
    baselines = (
        ('AutoJ', 'autoJ'),
        ('Critique', 'Critique'),
        ('GPT2', 'GPT2'),
        ('TIGER', 'TIGER'),
        ('BERT', 'BERT'),
        ('ROUGE', 'ROUGE'),
        ('BLEU', 'BLEU'),
        ('BART', 'BART'),
        ('BLEURT', 'BLEURT'),
        ('UNIEVAL', 'UNIEVAL'),
    )
    comparisons = (
        'human_a_vs_human_b',
        'human_a_vs_model_a',
        'human_a_vs_model_b',
        'model_a_vs_model_b',
    )

    with open(input_path, 'r') as f:
        dataset = json.loads(f.read())
    if not dataset:
        print('No records found in', input_path)
        return

    # Agreement of annotator 1 with every baseline on every comparison.
    rates = {
        (prefix, comparison): _agreement_rate(
            dataset, 'decided_1_' + comparison, prefix + '_' + comparison)
        for _, prefix in baselines
        for comparison in comparisons
    }

    for label, prefix in baselines:
        human_vs_model = (
            rates[prefix, 'human_a_vs_model_a'] + rates[prefix, 'human_a_vs_model_b']
        ) / 2
        print(f'Annotator A with {label}')
        print(f'Annotator A with {label} Human vs Human:', rates[prefix, 'human_a_vs_human_b'])
        print(f'Annotator A with {label} Human vs Model:', human_vs_model)
        print(f'Annotator A with {label} Model vs Model:', rates[prefix, 'model_a_vs_model_b'])


def query(input_path, output_file):
    """Ask a chat model, via ``send_request``, which of two answers is better.

    Args:
        input_path: JSON file with a list of records; each record provides
            ``Reference`` plus the answer fields named in ``modes2answer``.
        output_file: path that receives the whole annotated list as JSON.
            It is rewritten after every processed record.

    When ``input_path`` contains ``'geo'``, ``'pol'`` or ``'his'`` four
    comparisons are requested per record with model ``'gpt-3.5-turbo'`` and
    the replies are stored under ``ChatGPT_<mode>``.  Otherwise only
    model_a vs model_b is requested with model ``'gpt-4o'`` and the reply is
    stored under ``GPT-4_<mode>``.  The raw reply of ``send_request`` is
    stored as-is; it is not validated here.
    """
    with open(input_path, 'r') as f:
        context_generator = json.loads(f.read())
    if 'geo' in input_path or 'pol' in input_path or 'his' in input_path:
        modes = ['human_a_vs_human_b', 'human_a_vs_model_a', 'human_a_vs_model_b', 'model_a_vs_model_b']
        # Record fields holding the two answers shown as Answer_A / Answer_B.
        modes2answer = {'human_a_vs_human_b': ['refined_student_answer_a', 'refined_student_answer_b'], 'human_a_vs_model_a': ['refined_student_answer_a', 'model_answer_a'], 'human_a_vs_model_b': ['refined_student_answer_a', 'model_answer_b'], 'model_a_vs_model_b': ['model_answer_a', 'model_answer_b']}
        # Words the model is asked to reply with for Answer_A / Answer_B.
        modes2respond = {'human_a_vs_human_b': ['human_a', 'human_b'], 'human_a_vs_model_a': ['human_a', 'model_a'], 'human_a_vs_model_b': ['human_a', 'model_b'], 'model_a_vs_model_b': ['model_a', 'model_b']}
        for context_item in tqdm(context_generator):
            for mode in modes:
                key1, key2 = modes2answer[mode]
                res1, res2 = modes2respond[mode]
                system_message = context_item
                # In this branch the reference and answers are wrapped in ``` fences.
                mes = f'''
    Reference:
    ```
    {system_message['Reference']}
    ```
    Answer_A:
    ```
    {system_message[key1]}
    ```
    Answer_B:
    ```
    {system_message[key2]}
    ```
                '''
                final_contentpart1 = f'''
    Assume you are a teacher. Next, I will provide a paragraph of text containing Reference, Answer_A, and Answer_B. 
    You should decide which answer is better based on the Reference.
        '''
                final_contentpart2 = f'''
    If you think Answer_A is better, please type {res1}. If you think Answer_B is better, please type {res2}.
    Otherwise, please type "tie". Return with ONLY {res1} or {res2} or 'tie'.
    '''
                # Prompt order: instructions, then the texts, then the reply format.
                final_parts = [final_contentpart1, f"{mes}\n{final_contentpart2}"]
                final_parts = '\n'.join(final_parts)

                final_result = send_request(final_parts, 'gpt-3.5-turbo', temperature=1.0)  
                # Rebinds the loop variable to build the result key; the loop
                # itself is unaffected because ``modes`` is iterated directly.
                mode = 'ChatGPT_' + mode
                context_item[mode] = final_result
            # The full list is dumped after each record, so partial progress is on disk.
            with open(output_file, 'w') as f:
                f.write(json.dumps(context_generator, ensure_ascii=False, indent=4))
    else:
        modes = ['model_a_vs_model_b']
        modes2answer = {'model_a_vs_model_b': ['model_answer_a', 'model_answer_b']}
        modes2respond = {'model_a_vs_model_b': ['model_a', 'model_b']}
        for context_item in tqdm(context_generator):
            for mode in modes:
                key1, key2 = modes2answer[mode]
                res1, res2 = modes2respond[mode]
                system_message = context_item
                # Unlike the branch above, the texts are not fenced here.
                mes = f'''
    Reference:
    {system_message['Reference']}
    Answer_A:
    {system_message[key1]}
    Answer_B:
    {system_message[key2]}
                '''
                final_contentpart1 = f'''
    Assume you are a teacher. Next, I will provide a paragraph of text containing Reference, Answer_A, and Answer_B.
    You should decide which answer is better based on the Reference.
        '''
                final_contentpart2 = f'''
    If you think Answer_A is better, please type {res1}. If you think Answer_B is better, please type {res2}.
    Otherwise, please type "tie". Return with ONLY {res1} or {res2} or 'tie'.
    '''
                final_parts = [final_contentpart1, f"{mes}\n{final_contentpart2}"]
                final_parts = '\n'.join(final_parts)
                final_result = send_request(final_parts, 'gpt-4o', temperature=1.0)  
                # NOTE(review): results are keyed 'GPT-4_' although the model
                # requested above is 'gpt-4o' -- confirm the label is intended.
                mode = 'GPT-4_' + mode
                context_item[mode] = final_result
            # The full list is dumped after each record, so partial progress is on disk.
            with open(output_file, 'w') as f:
                f.write(json.dumps(context_generator, ensure_ascii=False, indent=4))

def _extract_choice(response):
    """Return the verdict ('a', 'b' or 'tie') stated in a judge response.

    The prompt asks the model to reason step by step and to finish with
    ``My choice is a`` / ``My choice is b`` / ``My choice is tie``.  Only the
    token that directly follows the *last* ``My choice is`` marker is used,
    so letters appearing in the reasoning or in a trailing explanation cannot
    flip the verdict.

    Args:
        response: Raw text returned by the model.

    Returns:
        'a', 'b' or 'tie'.  'tie' is also returned when no well-formed
        ``My choice is ...`` statement is present.
    """
    import re  # function-scope import keeps this block self-contained

    # \W* skips spaces, backticks, quotes or a colon after the marker; an
    # optional "Answer_" prefix is tolerated; "a tie" is matched before a
    # bare "a" so that "My choice is a tie" is not read as a vote for A.
    verdicts = re.findall(
        r"my choice is\W*(?:answer[_\s]*)?(a\s+tie|tie|a|b)(?![a-z])",
        response.lower(),
    )
    if not verdicts:
        return 'tie'
    final_verdict = verdicts[-1]
    return 'tie' if 'tie' in final_verdict else final_verdict


def query_new(input_path, output_file, model):
    """Ask ``model`` to compare two student answers for every dataset item.

    The JSON list at ``input_path`` is loaded; for each item a prompt is built
    from its 'reference', 'student_answer_a' and 'student_answer_b' fields and
    sent through ``send_request`` with temperature 1.0.  The parsed verdict is
    stored on the item under the key ``f"{model}_preference"`` and the whole
    list is written to ``output_file`` once all items have been processed.

    Args:
        input_path: Path of the JSON file holding the list of items.
        output_file: Path the annotated list is written to.
        model: Model name passed to ``send_request``; also used as the prefix
            of the result key.
    """
    with open(input_path, 'r') as f:
        dataset = json.loads(f.read())

    # The instruction parts do not depend on the item, so build them once.
    final_contentpart1 = '''
Assume you are a teacher. Next, I will provide a paragraph of text containing Reference, Answer_A, and Answer_B.
You should decide which answer is better based on the Reference.
    '''
    final_contentpart2 = '''
If you think Answer_A is better, please type a. If you think Answer_B is better, please type b. Otherwise, please type "tie". You should think step by step before making a decision. Return with ``My choice is a`` or ``My choice is b`` or ``My choice is tie``.''
'''

    for data in tqdm(dataset):
        mes = f'''
Reference:
{data['reference']}
Answer_A:
{data['student_answer_a']}
Answer_B:
{data['student_answer_b']}
        '''
        final_parts = '\n'.join([final_contentpart1, f"{mes}\n{final_contentpart2}"])
        final_result = send_request(final_parts, model, temperature=1.0)
        data[f'{model}_preference'] = _extract_choice(final_result)

    with open(output_file, 'w') as f:
        f.write(json.dumps(dataset, ensure_ascii=False, indent=4))
        
def _normalize_vote(raw_vote):
    """Map one raw model reply to 'a', 'b' or 'tie'.

    The reply is lower-cased and scanned for the stand-alone tokens ``a``,
    ``b`` and ``tie`` (not embedded in a longer run of letters, so
    ``Answer_B`` counts as ``b`` while the ``a`` in ``because`` is ignored).
    A reply naming exactly one of them yields that label; anything else
    (no label, several different labels, or a non-string reply) is 'tie'.
    """
    import re  # function-scope import keeps this block self-contained

    if not isinstance(raw_vote, str):
        # NOTE(review): send_request_sb's failure value is not visible here;
        # a non-text reply is counted as an abstention rather than crashing.
        return 'tie'
    labels = set(re.findall(r"(?<![a-z])(a|b|tie)(?![a-z])", raw_vote.lower()))
    return labels.pop() if len(labels) == 1 else 'tie'


def _majority_preference(votes):
    """Return the most frequent normalized label among ``votes``.

    Each vote is normalized *before* counting so that spelling variants such
    as ``a``, ``A`` and ``a.`` are tallied together.  With equal counts the
    label seen first wins; an empty ballot list yields 'tie'.
    """
    from collections import Counter

    tally = Counter(_normalize_vote(vote) for vote in votes)
    if not tally:
        return 'tie'
    return tally.most_common(1)[0][0]


def g_eval(input_path, output_file, model):
    """Judge each item by majority vote over five sampled model replies.

    The JSON list at ``input_path`` is loaded and the already-judged items
    are read from ``output_file``; processing resumes at index
    ``len(already judged)``.  A missing or empty ``output_file`` is treated
    as "nothing judged yet".  For every remaining item a prompt is built from
    its 'reference', 'student_answer_a' and 'student_answer_b' fields and
    sent five times through ``send_request_sb`` (temperature 1.0).  The raw
    replies are stored under ``f"{model}_Eval_votes"``, the majority label
    under ``f"{model}_Eval_preference"``, and ``output_file`` is rewritten
    after every item so an interrupted run can be resumed.

    Args:
        input_path: Path of the JSON file holding the list of items.
        output_file: Path of the JSON checkpoint / result file.
        model: Prefix used for the result keys.
            NOTE(review): requests are always sent to 'gpt-3.5-turbo'
            regardless of this value -- confirm whether that is intended.
    """
    with open(input_path, 'r') as f:
        dataset = json.loads(f.read())

    # Resume support: start from an empty result list on the first run.
    try:
        with open(output_file, 'r') as f:
            checkpoint_text = f.read()
    except FileNotFoundError:
        checkpoint_text = ''
    info = json.loads(checkpoint_text) if checkpoint_text.strip() else []

    # The instruction parts do not depend on the item, so build them once.
    final_contentpart1 = '''
Assume you are a teacher. Next, I will provide a paragraph of text containing Reference, Answer_A, and Answer_B.
You should decide which answer is better based on the Reference.
    '''
    final_contentpart2 = '''
If you think Answer_A is better, please type a. If you think Answer_B is better, please type b. Otherwise, please type "tie". Return with ONLY a or b or 'tie'.
'''

    for data in tqdm(dataset[len(info):]):
        mes = f'''
Reference:
{data['reference']}
Answer_A:
{data['student_answer_a']}
Answer_B:
{data['student_answer_b']}
        '''
        final_parts = '\n'.join([final_contentpart1, f"{mes}\n{final_contentpart2}"])

        votes = []
        for _ in range(5):
            final_result = send_request_sb(final_parts, 'gpt-3.5-turbo', temperature=1.0)
            print(final_result)
            votes.append(final_result)

        data[f'{model}_Eval_preference'] = _majority_preference(votes)
        data[f'{model}_Eval_votes'] = votes
        info.append(data)

        # Checkpoint after every item so progress survives an interruption.
        with open(output_file, 'w') as f:
            f.write(json.dumps(info, ensure_ascii=False, indent=4))


