
# gsm8k
def evaluate_gsm8k(data, model, tokenizer, batch_size=8):
    """Measure exact-match accuracy of ``model`` on GSM8K-style word problems.

    Every question is sent through ``model.generate`` in batches.  The last
    number found in the generated completion is compared numerically with the
    gold answer.  Each question, completion and gold answer is also written to
    ``evaluation_results.txt`` in the current working directory (the file is
    rewritten on every call and contains all batches of that call).

    Args:
        data: Iterable of mappings with ``question`` and ``answer`` strings.
            When ``answer`` contains ``####`` the text after the marker is the
            gold answer; otherwise the whole stripped answer is used.
        model: Model exposing ``to``, ``parameters``, ``eval`` and
            ``generate``.
        tokenizer: Tokenizer used to encode the questions and decode the
            generated ids.  Its ``pad_token_id`` is set to ``eos_token_id``
            when missing.
        batch_size: Number of questions generated per ``generate`` call.

    Returns:
        Fraction of samples answered correctly, or ``0`` when ``data`` is
        empty.
    """
    import re
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print(f"Model is on device: {next(model.parameters()).device}")
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    gsm8k_data = []
    for sample in data:
        answer = sample['answer']
        if "####" in answer:
            correct_answer = answer.split("####")[1].strip()
        else:
            correct_answer = answer.strip()
        gsm8k_data.append({
            "question": sample['question'],
            "correct_answer": correct_answer
        })

    # Optional sign (only when it is not glued to a preceding word/number, so
    # "5-3" is not read as -3), digits with optional thousands separators and
    # an optional decimal part.
    number_pattern = re.compile(r"(?:(?<![\w)])-)?\d+(?:,\d{3})*(?:\.\d+)?")

    def extract_answer(text):
        """Return the last number mentioned in ``text`` ("" when none)."""
        matches = number_pattern.findall(text)
        return matches[-1] if matches else ""

    def to_number(text):
        """Parse ``text`` as a float, ignoring thousands separators."""
        try:
            return float(text.replace(",", ""))
        except ValueError:
            # Unparseable answers are deliberately scored as incorrect.
            return None

    # Decoder-only models echo the prompt in the output of ``generate`` and
    # need left padding for batched generation; encoder-decoder models do not.
    decoder_only = not getattr(
        getattr(model, "config", None), "is_encoder_decoder", False
    )
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if decoder_only:
        tokenizer.padding_side = "left"

    model.eval()
    correct = 0
    total = 0
    prompt = ""  # Optional suffix appended to every question.
    output_file = "evaluation_results.txt"

    try:
        # Open the log once so that every batch ends up in the file.
        with open(output_file, "w", encoding="utf-8") as f:
            for start in range(0, len(gsm8k_data), batch_size):
                batch = gsm8k_data[start:start + batch_size]
                questions = [sample["question"] + prompt for sample in batch]
                correct_answers = [sample["correct_answer"] for sample in batch]

                inputs = tokenizer(
                    questions,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=1024,
                ).to(device)

                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=512,
                        pad_token_id=tokenizer.pad_token_id,
                    )

                if decoder_only:
                    # Drop the echoed prompt so that numbers appearing in the
                    # question can never be mistaken for the model's answer.
                    outputs = outputs[:, inputs["input_ids"].shape[1]:]

                generated_answers = [
                    tokenizer.decode(output, skip_special_tokens=True)
                    for output in outputs
                ]

                for question, generated_answer, correct_answer in zip(
                    questions, generated_answers, correct_answers
                ):
                    f.write(f"Question: {question}\n")
                    f.write(f"Generated Answer: {generated_answer}\n")
                    f.write(f"Correct Answer: {correct_answer}\n")
                    f.write("-" * 50 + "\n")

                    predicted = to_number(extract_answer(generated_answer))
                    expected = to_number(correct_answer)
                    if predicted is not None and predicted == expected:
                        correct += 1
                    total += 1
    finally:
        # Leave the caller's tokenizer configured as it was handed in.
        if decoder_only and previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    accuracy = correct / total if total > 0 else 0
    return accuracy

#humaneval
def evaluate_humaneval(data, model, tokenizer, k_values=(1,), timeout=10):
    """Estimate pass@k on HumanEval-style problems by running generated code.

    For every sample ten candidates are produced with beam search
    (``num_beams=10``).  The code part of each decoded candidate is joined
    with the sample's test code and a ``check(<entry_point>)`` call, and the
    resulting program is executed in a child process under a time limit.

    SECURITY: candidate programs are model-generated and run through
    ``exec``.  The guard used here only removes ``os.system`` and
    ``os.remove`` inside the child process; it is not a sandbox.  Run this
    only in an environment where arbitrary code execution is acceptable.

    Args:
        data: Iterable of mappings with ``prompt``, ``test`` and
            ``entry_point`` keys.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode candidates.
            Its ``pad_token_id`` is set to ``eos_token_id`` when missing.
        k_values: Iterable of ``k`` values to report.
        timeout: Seconds (int or float) allowed for one candidate program.

    Returns:
        Dict mapping ``"pass@{k}"`` to the score averaged over all problems
        (``0.0`` when ``data`` is empty).  When ``k`` exceeds the number of
        candidates per problem, a problem counts as solved if any candidate
        passed.
    """
    import math
    import multiprocessing
    from contextlib import contextmanager
    import os
    import shutil
    import signal
    import tempfile
    from tqdm import tqdm
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # ``padding=True`` below fails on tokenizers without a padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # The process target is a nested function, which only the "fork" start
    # method can run without pickling.  Request it explicitly and fall back
    # to the platform default where "fork" does not exist.
    try:
        mp_context = multiprocessing.get_context("fork")
    except ValueError:
        mp_context = multiprocessing.get_context()

    class TimeoutException(Exception):
        pass

    @contextmanager
    def time_limit(seconds):
        """Raise ``TimeoutException`` if the body runs longer than ``seconds``."""
        def signal_handler(signum, frame):
            raise TimeoutException("Timed out!")
        signal.signal(signal.SIGALRM, signal_handler)
        # setitimer (unlike alarm) also accepts fractional timeouts.
        signal.setitimer(signal.ITIMER_REAL, seconds)
        try:
            yield
        finally:
            signal.setitimer(signal.ITIMER_REAL, 0)

    @contextmanager
    def create_tempdir():
        """Run the body inside a scratch directory that is removed afterwards."""
        tempdir = tempfile.mkdtemp()
        previous_cwd = os.getcwd()
        # Relative paths used by the executed program land in the scratch dir.
        os.chdir(tempdir)
        try:
            yield tempdir
        finally:
            os.chdir(previous_cwd)
            shutil.rmtree(tempdir)

    def reliability_guard():
        """Remove two destructive ``os`` functions (child process only)."""
        del os.system
        del os.remove

    code_starts = ("from ", "import ", "def ", "class ", "@")
    top_level_ok = code_starts + ("#", ")")

    def filter_code(completion: str) -> str:
        """Extract the Python code part of a decoded candidate.

        Collection starts at the first import/def/class/decorator line and
        stops at the first unindented line that is not a definition, import,
        decorator, comment or closing parenthesis (e.g. prose, ``print`` or an
        ``if __name__`` guard).  Trailing lines are then dropped until the
        text compiles, so a definition cut off by the token limit does not
        invalidate the whole program.
        """
        kept = []
        in_string = False  # Rough tracking of triple-quoted strings.
        for line in completion.split("\n"):
            stripped = line.strip()
            if not kept and not stripped.startswith(code_starts):
                continue
            if (
                kept
                and not in_string
                and stripped
                and not line[0].isspace()
                and not stripped.startswith(top_level_ok)
            ):
                break
            kept.append(line)
            if (line.count('"""') + line.count("'''")) % 2:
                in_string = not in_string

        for end in range(len(kept), 0, -1):
            candidate = "\n".join(kept[:end])
            try:
                compile(candidate, "<completion>", "exec")
            except (SyntaxError, ValueError):
                continue
            return candidate
        # Nothing compiles: return the raw selection and let execution fail.
        return "\n".join(kept)

    def check_correctness(problem, completion, timeout):
        """Return True when ``completion`` passes the problem's tests."""
        def unsafe_execute(result):
            with create_tempdir():
                reliability_guard()
                filtered_completion = filter_code(completion)
                check_program = (
                    filtered_completion + "\n" +
                    problem["test_code"] + "\n" +
                    f"check({problem['entry_point']})"
                )

                try:
                    exec_globals = {}
                    with time_limit(timeout):
                        # Executes untrusted, model-generated code.
                        exec(check_program, exec_globals)
                    result.append("passed")
                except TimeoutException:
                    result.append("timed out")
                except BaseException as e:
                    result.append(f"failed: {e}")

        # The context manager shuts the manager's server process down again;
        # otherwise one such process is left behind per checked candidate.
        with mp_context.Manager() as manager:
            result = manager.list()
            p = mp_context.Process(target=unsafe_execute, args=(result,))
            p.start()
            p.join(timeout=timeout + 1)
            if p.is_alive():
                p.kill()
                p.join()  # Reap the killed child.
            outcome = result[0] if len(result) > 0 else "timed out"
        return outcome == "passed"

    def calculate_pass_at_k(results, k):
        """Average the unbiased estimate ``1 - C(n-c, k) / C(n, k)``."""
        if not results:
            return 0.0
        accumulated = 0.0
        for outcomes in results:
            n = len(outcomes)
            c = sum(outcomes)
            if c == 0:
                continue
            if n - c < k:
                # Every k-subset of the candidates contains a passing one.
                accumulated += 1.0
            else:
                accumulated += 1.0 - math.comb(n - c, k) / math.comb(n, k)
        return accumulated / len(results)

    model.eval()
    results = []

    for sample in tqdm(data, desc="Evaluating HumanEval"):
        prompt = sample["prompt"]
        problem = {
            "prompt": prompt,
            "test_code": sample["test"],
            "entry_point": sample["entry_point"]
        }

        inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                num_return_sequences=10,
                num_beams=10,
                pad_token_id=tokenizer.pad_token_id
            )

        # The full decoded text is kept: ``filter_code`` relies on the
        # imports and signature that a decoder-only model echoes from the
        # prompt.
        generated_solutions = [
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs
        ]
        results.append([
            check_correctness(problem, generated_solution, timeout)
            for generated_solution in generated_solutions
        ])

    return {f"pass@{k}": calculate_pass_at_k(results, k) for k in k_values}

def evaluate_pubmedqa(data, model, tokenizer, batch_size=8):
    """Score generated PubMedQA answers against references with BLEU.

    Each prompt is ``"{instruction}\\n{input}"``.  The generated completion
    and the reference are split on whitespace and compared with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1``.  Prompt,
    reference, hypothesis and score are printed for every sample.

    Args:
        data: Iterable of mappings with ``instruction``, ``input`` and
            ``output`` keys.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode completions.
            Its ``pad_token_id`` is set to ``eos_token_id`` when missing.
        batch_size: Number of prompts generated per ``generate`` call.

    Returns:
        Mean sentence-level BLEU over all samples (``0.0`` for empty data).
    """
    import torch
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from tqdm import tqdm

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # ``padding=True`` below fails on tokenizers without a padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    pubmedqa_data = [
        {
            "instruction": sample['instruction'],
            "input": sample['input'],
            "output": sample['output']
        }
        for sample in data
    ]
    num_samples = len(pubmedqa_data)
    if num_samples == 0:
        return 0.0

    smoothing_function = SmoothingFunction().method1

    def calculate_bleu(reference, hypothesis):
        """Whitespace-tokenised, smoothed sentence BLEU."""
        return sentence_bleu(
            [reference.split()],
            hypothesis.split(),
            smoothing_function=smoothing_function,
        )

    # Decoder-only models echo the prompt in the output of ``generate`` and
    # need left padding for batched generation; encoder-decoder models do not.
    decoder_only = not getattr(
        getattr(model, "config", None), "is_encoder_decoder", False
    )
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if decoder_only:
        tokenizer.padding_side = "left"

    model.eval()
    total_bleu = 0.0
    try:
        for i in tqdm(range(0, num_samples, batch_size), desc="Evaluating PubMedQA"):
            batch = pubmedqa_data[i:i + batch_size]
            prompts = [f"{sample['instruction']}\n{sample['input']}" for sample in batch]
            references = [sample["output"] for sample in batch]

            tokenized_inputs = tokenizer(
                prompts, return_tensors="pt", padding=True, truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **tokenized_inputs,
                    max_new_tokens=512,
                    pad_token_id=tokenizer.pad_token_id
                )
            if decoder_only:
                # Score only the completion, not the echoed prompt.
                outputs = outputs[:, tokenized_inputs["input_ids"].shape[1]:]
            hypotheses = [
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            ]

            for prompt, reference, hypothesis in zip(prompts, references, hypotheses):
                bleu_score = calculate_bleu(reference, hypothesis)
                total_bleu += bleu_score

                print(f"Prompt: {prompt}")
                print(f"Reference: {reference}")
                print(f"Hypothesis: {hypothesis}")
                print(f"BLEU Score: {bleu_score}")
                print("-" * 50)
    finally:
        # Leave the caller's tokenizer configured as it was handed in.
        if decoder_only and previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    average_bleu = total_bleu / num_samples
    return average_bleu

def evaluate_fiqa(data, model, tokenizer, batch_size=8):
    """Score generated FiQA answers against references with BLEU.

    Each prompt is ``"{instruction}\\n{input}"``.  The generated completion
    and the reference are split on whitespace and compared with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1``.  Prompt,
    reference, hypothesis and score are printed for every sample.

    Args:
        data: Iterable of mappings with ``instruction``, ``input`` and
            ``output`` keys.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode completions.
            Its ``pad_token_id`` is set to ``eos_token_id`` when missing.
        batch_size: Number of prompts generated per ``generate`` call.

    Returns:
        Mean sentence-level BLEU over all samples (``0.0`` for empty data).
    """
    import torch
    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
    from tqdm import tqdm

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # ``padding=True`` below fails on tokenizers without a padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    fiqa_data = [
        {
            "instruction": sample['instruction'],
            "input": sample['input'],
            "output": sample['output']
        }
        for sample in data
    ]
    num_samples = len(fiqa_data)
    if num_samples == 0:
        return 0.0

    smoothing_function = SmoothingFunction().method1

    def calculate_bleu(reference, hypothesis):
        """Whitespace-tokenised, smoothed sentence BLEU."""
        return sentence_bleu(
            [reference.split()],
            hypothesis.split(),
            smoothing_function=smoothing_function,
        )

    # Decoder-only models echo the prompt in the output of ``generate`` and
    # need left padding for batched generation; encoder-decoder models do not.
    decoder_only = not getattr(
        getattr(model, "config", None), "is_encoder_decoder", False
    )
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if decoder_only:
        tokenizer.padding_side = "left"

    model.eval()
    total_bleu = 0.0
    try:
        for i in tqdm(range(0, num_samples, batch_size), desc="Evaluating fiqa"):
            batch = fiqa_data[i:i + batch_size]
            prompts = [f"{sample['instruction']}\n{sample['input']}" for sample in batch]
            references = [sample["output"] for sample in batch]

            tokenized_inputs = tokenizer(
                prompts, return_tensors="pt", padding=True, truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **tokenized_inputs,
                    max_new_tokens=512,
                    pad_token_id=tokenizer.pad_token_id
                )
            if decoder_only:
                # Score only the completion, not the echoed prompt.
                outputs = outputs[:, tokenized_inputs["input_ids"].shape[1]:]
            hypotheses = [
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            ]

            for prompt, reference, hypothesis in zip(prompts, references, hypotheses):
                bleu_score = calculate_bleu(reference, hypothesis)
                total_bleu += bleu_score

                print(f"Prompt: {prompt}")
                print(f"Reference: {reference}")
                print(f"Hypothesis: {hypothesis}")
                print(f"BLEU Score: {bleu_score}")
                print("-" * 50)
    finally:
        # Leave the caller's tokenizer configured as it was handed in.
        if decoder_only and previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    average_bleu = total_bleu / num_samples
    return average_bleu

def evaluate_fiqa_with_bertscore(data, model, tokenizer, batch_size=8):
    """Score generated FiQA answers against references with BERTScore.

    Each prompt is ``"{instruction}\\n{input}"``.  All completions are
    generated first and then scored in a single ``bert_score.score`` call
    (``lang="en"``).  Prompt, reference, hypothesis and F1 are printed for
    every sample.

    Args:
        data: Iterable of mappings with ``instruction``, ``input`` and
            ``output`` keys.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode completions.
            Its ``pad_token_id`` is set to ``eos_token_id`` when missing.
        batch_size: Number of prompts generated per ``generate`` call.

    Returns:
        Mean BERTScore F1 over all samples (``0.0`` for empty data).
    """
    from bert_score import score
    import torch
    from tqdm import tqdm

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # ``padding=True`` below fails on tokenizers without a padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    fiqa_data = [
        {
            "instruction": sample['instruction'],
            "input": sample['input'],
            "output": sample['output']
        }
        for sample in data
    ]
    if not fiqa_data:
        # Nothing to generate or score.
        return 0.0

    # Decoder-only models echo the prompt in the output of ``generate`` and
    # need left padding for batched generation; encoder-decoder models do not.
    decoder_only = not getattr(
        getattr(model, "config", None), "is_encoder_decoder", False
    )
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if decoder_only:
        tokenizer.padding_side = "left"

    model.eval()
    # Prompts are accumulated across batches as well, so the report below
    # pairs every sample with its own prompt.
    all_prompts = []
    all_references = []
    all_hypotheses = []

    try:
        for i in tqdm(range(0, len(fiqa_data), batch_size), desc="Evaluating fiqa with BERTScore"):
            batch = fiqa_data[i:i + batch_size]
            prompts = [f"{sample['instruction']}\n{sample['input']}" for sample in batch]
            references = [sample["output"] for sample in batch]

            tokenized_inputs = tokenizer(
                prompts, return_tensors="pt", padding=True, truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **tokenized_inputs,
                    max_new_tokens=512,
                    pad_token_id=tokenizer.pad_token_id
                )
            if decoder_only:
                # Score only the completion, not the echoed prompt.
                outputs = outputs[:, tokenized_inputs["input_ids"].shape[1]:]

            hypotheses = [
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            ]

            all_prompts.extend(prompts)
            all_references.extend(references)
            all_hypotheses.extend(hypotheses)
    finally:
        # Leave the caller's tokenizer configured as it was handed in.
        if decoder_only and previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    P, R, F1 = score(all_hypotheses, all_references, lang="en", verbose=True)

    for prompt, reference, hypothesis, f1_score in zip(all_prompts, all_references, all_hypotheses, F1):
        print(f"Prompt: {prompt}")
        print(f"Reference: {reference}")
        print(f"Hypothesis: {hypothesis}")
        print(f"BERTScore F1: {f1_score.item()}")
        print("-" * 50)

    average_bertscore = F1.mean().item()
    return average_bertscore

def evaluate_pubmedqa_with_bertscore(data, model, tokenizer, batch_size=8):
    """Score generated PubMedQA answers against references with BERTScore.

    Each prompt is ``"{instruction}\\n{input}"``.  All completions (at most
    200 new tokens each) are generated first and then scored in a single
    ``bert_score.score`` call (``lang="en"``).  Prompt, reference, hypothesis
    and F1 are printed for every sample.

    Args:
        data: Iterable of mappings with ``instruction``, ``input`` and
            ``output`` keys.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode completions.
            Its ``pad_token_id`` is set to ``eos_token_id`` when missing.
        batch_size: Number of prompts generated per ``generate`` call.

    Returns:
        Mean BERTScore F1 over all samples (``0.0`` for empty data).
    """
    from bert_score import score
    import torch
    from tqdm import tqdm

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # ``padding=True`` below fails on tokenizers without a padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    pubmedqa_data = [
        {
            "instruction": sample['instruction'],
            "input": sample['input'],
            "output": sample['output']
        }
        for sample in data
    ]
    if not pubmedqa_data:
        # Nothing to generate or score.
        return 0.0

    # Decoder-only models echo the prompt in the output of ``generate`` and
    # need left padding for batched generation; encoder-decoder models do not.
    decoder_only = not getattr(
        getattr(model, "config", None), "is_encoder_decoder", False
    )
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if decoder_only:
        tokenizer.padding_side = "left"

    model.eval()
    # Prompts are accumulated across batches as well, so the report below
    # pairs every sample with its own prompt.
    all_prompts = []
    all_references = []
    all_hypotheses = []

    try:
        # The progress label previously said "fiqa" (copy-paste slip).
        for i in tqdm(range(0, len(pubmedqa_data), batch_size), desc="Evaluating PubMedQA with BERTScore"):
            batch = pubmedqa_data[i:i + batch_size]
            prompts = [f"{sample['instruction']}\n{sample['input']}" for sample in batch]
            references = [sample["output"] for sample in batch]

            tokenized_inputs = tokenizer(
                prompts, return_tensors="pt", padding=True, truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **tokenized_inputs,
                    max_new_tokens=200,
                    pad_token_id=tokenizer.pad_token_id
                )
            if decoder_only:
                # Score only the completion, not the echoed prompt.
                outputs = outputs[:, tokenized_inputs["input_ids"].shape[1]:]

            hypotheses = [
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            ]

            all_prompts.extend(prompts)
            all_references.extend(references)
            all_hypotheses.extend(hypotheses)
    finally:
        # Leave the caller's tokenizer configured as it was handed in.
        if decoder_only and previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    P, R, F1 = score(all_hypotheses, all_references, lang="en", verbose=True)

    for prompt, reference, hypothesis, f1_score in zip(all_prompts, all_references, all_hypotheses, F1):
        print(f"Prompt: {prompt}")
        print(f"Reference: {reference}")
        print(f"Hypothesis: {hypothesis}")
        print(f"BERTScore F1: {f1_score.item()}")
        print("-" * 50)

    average_bertscore = F1.mean().item()
    return average_bertscore

import requests
import json
from tqdm import tqdm

def evaluate_pubmedqa_judge_model(data, model, tokenizer, batch_size=16,
                                  api_url="model_url", api_key="your_api_key",
                                  judge_model="model_name"):
    """Grade generated PubMedQA answers with a remote judge model.

    Each prompt is ``"{instruction}\\n{input}\\nPlease answer with only 'Yes'
    or 'No'."``.  The generated answer, the reference and the prompt are sent
    to an OpenAI-compatible chat-completions endpoint; a sample counts as
    correct when the first yes/no word in the judge's reply is "yes"
    (case-insensitive).

    Args:
        data: Iterable of mappings with ``instruction``, ``input`` and
            ``output`` keys.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode completions.
            Its ``pad_token_id`` is set to ``eos_token_id`` when missing.
        batch_size: Number of prompts generated per ``generate`` call.
        api_url: Base URL of the judge endpoint (default is the placeholder
            that used to be hard-coded).
        api_key: API key for the judge endpoint (placeholder by default).
        judge_model: Model name sent to the judge endpoint (placeholder by
            default).

    Returns:
        Fraction of samples the judge accepted, or ``0`` for empty data.
    """
    import re
    from openai import OpenAI
    from tqdm import tqdm
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # ``padding=True`` below fails on tokenizers without a padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    pubmedqa_data = [
        {
            "instruction": sample['instruction'],
            "input": sample['input'],
            "output": sample['output']
        }
        for sample in data
    ]

    # One client for the whole run instead of one per batch.
    client = OpenAI(api_key=api_key, base_url=api_url)
    verdict_pattern = re.compile(r"\b(yes|no)\b", re.IGNORECASE)

    # Decoder-only models echo the prompt in the output of ``generate`` and
    # need left padding for batched generation; encoder-decoder models do not.
    decoder_only = not getattr(
        getattr(model, "config", None), "is_encoder_decoder", False
    )
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if decoder_only:
        tokenizer.padding_side = "left"

    model.eval()
    correct = 0
    total = 0

    try:
        for i in tqdm(range(0, len(pubmedqa_data), batch_size), desc="Evaluating with Judge Model"):
            batch = pubmedqa_data[i:i + batch_size]
            references = [sample["output"] for sample in batch]
            prompts = [
                f"{sample['instruction']}\n{sample['input']}\nPlease answer with only 'Yes' or 'No'."
                for sample in batch
            ]

            tokenized_inputs = tokenizer(
                prompts, return_tensors="pt", padding=True, truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **tokenized_inputs,
                    max_new_tokens=1024,
                    pad_token_id=tokenizer.pad_token_id
                )
            if decoder_only:
                # Keep only the completion, not the echoed prompt.
                outputs = outputs[:, tokenized_inputs["input_ids"].shape[1]:]

            hypotheses = [
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            ]

            for reference, hypothesis, prompt in zip(references, hypotheses, prompts):
                # Text-level fallback for outputs that still repeat the prompt.
                if hypothesis.startswith(prompt):
                    hypothesis = hypothesis[len(prompt):].strip()
                if len(hypothesis) == 0:
                    hypothesis = "I don't know"
                content = (
                    f"{prompt}\n"
                    f"------------------------------------------------------\n"
                    f"Reference answer: {reference}\n"
                    f"------------------------------------------------------\n"
                    f"Generated answer: {hypothesis}\n"
                    f"------------------------------------------------------\n"
                    f"You are a judge model. The generated answer is an output of a test model. The generated answer and the refernece answer both try to give only positive or negative answer to the question.You need to summarize them as 'yes' or 'no'. And then please determine whether the generated answer negates the reference answer. "
                    f"Return 'No' if ANY of these conditions are met:\n"
                    f"The Generated Answer explicitly contradicts or refutes the Reference Answer\n"
                    f"The Generated Answer contains no substantive content or it didn't answer the question, please pay attention that the generated answer may just repeat the context and the question due to its limited perfomance.\n"
                    f"The Generated Answer contains I don't know or similar uncertain statements\n"
                    f"The generated answer is empty.\n"
                    f"Return 'Yes' ONLY if ALL these conditions are met:\n"
                    f"The Generated Answer does not contradict the Reference Answer\n"
                    # This condition used to end with a stray ", return 'No'",
                    # which contradicted the 'Yes' list it belongs to.
                    f"The Generated Answer contains substantive content\n"
                    f"Please answer with only 'Yes' or 'No'.\n"
                )

                response = client.chat.completions.create(
                    model=judge_model,
                    messages=[
                        {"role": "user", "content": content}
                    ],
                    max_tokens=1688,
                    temperature=0.7,
                    stream=False
                )

                # The message content may be None (e.g. an empty reply).
                answer = (response.choices[0].message.content or "").strip()

                # The first yes/no word decides, so "yes", "Yes." and "**Yes**"
                # are accepted while "No ... Yes ..." is not.
                verdict = verdict_pattern.search(answer)
                if verdict is not None and verdict.group(1).lower() == "yes":
                    correct += 1
                total += 1
    finally:
        # Leave the caller's tokenizer configured as it was handed in.
        if decoder_only and previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    accuracy = correct / total if total > 0 else 0
    return accuracy
    

def evaluate_fiqa_judge_model(data, model, tokenizer, batch_size=8,
                              api_url="api_url", api_key="your_api_key",
                              judge_model="claude-3-7-sonnet-20250219",
                              request_timeout=60):
    """Grade generated FiQA answers with a remote judge model over HTTP.

    Each prompt is ``"{instruction}\\n{input}\\nPlease answer with only 'Yes'
    or 'No'."``.  Prompt, reference and generated answer are POSTed as a
    chat-completions payload to ``api_url``; a sample counts as correct when
    the first yes/no word in the judge's reply is "yes" (case-insensitive).
    Failed or malformed judge responses are reported on stdout and counted as
    incorrect.

    Args:
        data: Iterable of mappings with ``instruction``, ``input`` and
            ``output`` keys.
        model: Model exposing ``to``, ``eval`` and ``generate``.
        tokenizer: Tokenizer used to encode prompts and decode completions.
            Its ``pad_token_id`` is set to ``eos_token_id`` when missing.
        batch_size: Number of prompts generated per ``generate`` call.
        api_url: URL the judge request is POSTed to (default is the
            placeholder that used to be hard-coded).
        api_key: Bearer token for the judge endpoint (placeholder by default).
        judge_model: Model name sent in the payload.
        request_timeout: Seconds to wait for each judge response.

    Returns:
        Fraction of samples the judge accepted, or ``0`` for empty data.
    """
    import re
    import requests
    import json
    from tqdm import tqdm
    import torch

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # ``padding=True`` below fails on tokenizers without a padding token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    fiqa_data = [
        {
            "instruction": sample['instruction'],
            "input": sample['input'],
            "output": sample['output']
        }
        for sample in data
    ]

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    verdict_pattern = re.compile(r"\b(yes|no)\b", re.IGNORECASE)

    # Decoder-only models echo the prompt in the output of ``generate`` and
    # need left padding for batched generation; encoder-decoder models do not.
    decoder_only = not getattr(
        getattr(model, "config", None), "is_encoder_decoder", False
    )
    previous_padding_side = getattr(tokenizer, "padding_side", None)
    if decoder_only:
        tokenizer.padding_side = "left"

    model.eval()
    correct = 0
    total = 0

    try:
        for i in tqdm(range(0, len(fiqa_data), batch_size), desc="Evaluating with Judge Model"):
            batch = fiqa_data[i:i + batch_size]
            references = [sample["output"] for sample in batch]
            # NOTE(review): the Yes/No instruction matches the PubMedQA judge
            # prompt; confirm it is intended for this dataset as well.
            prompts = [
                f"{sample['instruction']}\n{sample['input']}\nPlease answer with only 'Yes' or 'No'."
                for sample in batch
            ]

            tokenized_inputs = tokenizer(
                prompts, return_tensors="pt", padding=True, truncation=True
            ).to(device)

            with torch.no_grad():
                outputs = model.generate(
                    **tokenized_inputs,
                    max_new_tokens=1024,
                    pad_token_id=tokenizer.pad_token_id
                )
            if decoder_only:
                # Keep only the completion, not the echoed prompt.
                outputs = outputs[:, tokenized_inputs["input_ids"].shape[1]:]

            hypotheses = [
                tokenizer.decode(output, skip_special_tokens=True) for output in outputs
            ]

            for reference, hypothesis, prompt in zip(references, hypotheses, prompts):
                # Text-level fallback for outputs that still repeat the prompt.
                if hypothesis.startswith(prompt):
                    hypothesis = hypothesis[len(prompt):].strip()
                content = (
                    f"Prompt: {prompt}\n"
                    f"Reference answer: {reference}\n"
                    f"Generated answer: {hypothesis}\n"
                    f"Please determine whether the generated answer negates the reference answer. "
                    f"If the generated answer does not negate the reference answer, respond with 'Yes'. "
                    f"If the generated answer negates the reference answer, respond with 'No'."
                )

                print(f"Content sent to judge_model:\n{content}\n{'-' * 50}")

                payload = json.dumps({
                    "model": judge_model,
                    "messages": [
                        {
                            "role": "user",
                            "content": content
                        }
                    ],
                    "max_tokens": 1688,
                    "temperature": 0.5,
                    "stream": False
                })

                answer = None
                try:
                    # A timeout keeps a stalled endpoint from hanging the run.
                    response = requests.post(
                        api_url, headers=headers, data=payload, timeout=request_timeout
                    )
                    if response.status_code == 200:
                        result = response.json()
                        answer = (result["choices"][0]["message"]["content"] or "").strip()
                    else:
                        print(f"Judge request failed with HTTP status {response.status_code}")
                except requests.RequestException as error:
                    print(f"Judge request failed: {error}")
                except (KeyError, IndexError, TypeError, ValueError) as error:
                    print(f"Unexpected judge response format: {error!r}")

                if answer is not None:
                    print(f"Judge Model Response: {answer}")
                    # The first yes/no word decides, so "yes" and "Yes." are
                    # accepted as well as the exact string "Yes".
                    verdict = verdict_pattern.search(answer)
                    if verdict is not None and verdict.group(1).lower() == "yes":
                        correct += 1
                # Samples whose judge call failed still count towards the total.
                total += 1
    finally:
        # Leave the caller's tokenizer configured as it was handed in.
        if decoder_only and previous_padding_side is not None:
            tokenizer.padding_side = previous_padding_side

    accuracy = correct / total if total > 0 else 0
    return accuracy
