from huggingface_hub import login
import torch
import os
import sys
import subprocess
import argparse
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from utils.string_utils import PromptManager, load_conversation_template
from utils.opt_utils import load_model_and_tokenizer, get_latest_commit_info
from utils.harmful_decoding import HarmfulDecoding,MyDecoder
from utils.ppl_calculator import PPL_Calculator
from utils.bpe import load_subword_nmt_table, BpeOnlineTokenizer
from utils.model import GPT
from safe_eval import DictJudge, GPTJudge
import numpy as np
from tqdm import tqdm
import copy, json, time, logging
import json
from datasets import load_dataset, concatenate_datasets
from safe_eval import DictJudge, GPTJudge, QwenJudge



# Path of the JSON file holding the generations to score, and the prefix
# from which the evaluation report's file name is derived later on.
file_path = "input.json"
save_path = "your_save_path.json"


def _pluck(records, key):
    """Return the value stored under ``key`` for every record, in order."""
    return [record[key] for record in records]


# The samples to evaluate live under the top-level 'data' entry.
with open(file_path, 'r', encoding='utf-8') as fh:
    output_json = json.load(fh)
results = output_json['data']

# Split the samples into three parallel, index-aligned lists.
goals = _pluck(results, 'goal')
instructions = _pluck(results, 'instruction')
responses = _pluck(results, 'output')


# Judge that scores (goal, response) pairs; the API key is a placeholder
# that must be filled in before running.
gpt_judge = GPTJudge('gpt', api="your_api_key")

# goals and responses are index-aligned (both derived from the same samples),
# so zipping them produces exactly one (goal, response) tuple per sample.
goals_responses_pairs = list(zip(goals, responses))

# evaluate() hands back two values, unpacked here as scores and reasons;
# score_parse() is then applied to the scores.
# NOTE(review): presumably score_parse turns raw judge output into numeric
# scores -- confirm against GPTJudge.
scores, reasons = gpt_judge.evaluate(goals_responses_pairs)
scores_gpt = gpt_judge.score_parse(scores)

# Second judge, run over the responses alone.
dict_judge = DictJudge()
scores_dict = dict_judge.eval_batch(responses)

# One record per sample, combining the original fields with both judges' output.
safe_eval_results = []

for idx, (goal, instruction, response) in enumerate(
    zip(goals, instructions, responses)
):
    # Judge outputs are looked up by position so that a judge returning
    # fewer entries than there are samples still fails loudly here.
    record = {
        "id": idx,
        "goal": goal,
        "instruction": instruction,
        "output": response,
        "dic_score": scores_dict[idx],
        "GPT_score": scores_gpt[idx],
        "GPT_reason": reasons[idx],
    }
    print(record)
    safe_eval_results.append(record)

# Histogram over the five admissible integer scores, all starting at zero.
score_counts = dict.fromkeys(range(1, 6), 0)
valid_gpt_scores = []

for record in safe_eval_results:
    candidate = record.get('GPT_score')
    # Ignore anything that is not a number (this also covers None).
    if not isinstance(candidate, (int, float)):
        continue
    # Ignore numbers outside the closed interval [1, 5].
    if not (1 <= candidate <= 5):
        continue
    # Fractional scores are bucketed by truncation, but the untruncated
    # value is what goes into the mean.
    score_counts[int(candidate)] += 1
    valid_gpt_scores.append(candidate)

# With no usable scores the mean is reported as 0.
if valid_gpt_scores:
    gpt_score_mean = np.mean(valid_gpt_scores)
else:
    gpt_score_mean = 0

# The aggregate statistics are appended as the final element of the same list.
summary = {
    "GPT_score_mean": round(gpt_score_mean, 4),
    "GPT_score_distribution": score_counts,
}
safe_eval_results.append({"result": summary})

# Write the evaluation results to '<save_path>_gpt_eval.json'.
# The encoding is pinned to UTF-8 to match how the input file is read, rather
# than depending on the platform's default encoding. ensure_ascii=False keeps
# non-ASCII text readable in the report instead of emitting \uXXXX escapes;
# the parsed JSON content is unchanged.
with open(save_path + '_gpt_eval.json', 'w', encoding='utf-8') as f:
    json.dump(safe_eval_results, f, indent=4, ensure_ascii=False)
