#!/usr/bin/env python
# coding: utf-8
"""
Evaluation Script for Self-Instruct Dataset

This script evaluates trained models on the Self-Instruct dataset
using ROUGE metrics. It supports multiple evaluation runs with different
random seeds for robust performance assessment.

Usage:
    python inst_tuning_self_inst.py <model_name>
    
Args:
    model_name: Path to the trained model to evaluate

"""

import torch
import os
import pandas as pd
import sys
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from tqdm import tqdm
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import evaluate

# Device configuration: prefer the GPU, but fall back to the CPU so the script
# still runs (slowly) on hosts without CUDA instead of failing at `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer.
# Prompts are padded into batches and passed to `generate` on a causal LM;
# decoder-only models continue from the last position of each row, so the
# padding has to sit on the left of the prompt rather than between the prompt
# and the generated tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "MiniLLM/teacher-OPT-13B",
    padding_side="left",
)

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")


def format_eval_self_inst(example):
    """
    Format evaluation examples for Self-Instruct dataset.

    Args:
        example: Dataset example with an "instruction" entry and an
            "instances" mapping. The "input" entry of "instances" may be a
            plain string or a list/tuple of strings; for a list/tuple only
            the first element is used (an empty one yields an empty input).

    Returns:
        Dict with a single "prompt" key holding the formatted prompt.
    """
    task_input = example["instances"]["input"]
    # The tokenization step reads instances["output"][0], i.e. it treats the
    # instance fields as sequences. Interpolating such a sequence directly
    # would put its repr ("['...']") into the prompt, so unwrap it first.
    if isinstance(task_input, (list, tuple)):
        task_input = task_input[0] if task_input else ""

    # NOTE(review): the leading spaces on each continuation line are part of
    # the prompt text and are kept unchanged on purpose - confirm they match
    # the template the model was trained with.
    prompt = f"""Below is an instruction that describes a task.
        Write a response that appropriately completes the request.
        ### Instruction:
        {example["instruction"]}
        ### Input:
        {task_input}
        ### Response:
        """
    return {"prompt": prompt}


def tokenize_eval_self_inst(example):
    """
    Tokenize the prompt and the reference output of an evaluation example.

    The module-level ``tokenizer`` is applied twice with the same settings
    (truncation to 1024 tokens, padding, PyTorch tensors): once to
    ``example["prompt"]`` and once to ``example["instances"]["output"][0]``.

    Args:
        example: Mapping with a "prompt" entry and an "instances" mapping
            whose "output" entry is indexable; its first element is used as
            the reference text. The evaluation loop passes DataLoader
            batches here.

    Returns:
        The tokenizer output for the prompt, extended with a "labels" entry
        holding the input ids of the reference text.
    """
    # Both calls share exactly the same tokenizer options.
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": 1024,
        "return_tensors": "pt",
    }

    encoded_prompt = tokenizer(example["prompt"], **encode_options)

    reference_text = example["instances"]["output"][0]
    encoded_reference = tokenizer(reference_text, **encode_options)

    encoded_prompt["labels"] = encoded_reference["input_ids"]
    return encoded_prompt


# Fetch the "human_eval" configuration of Self-Instruct and attach a formatted
# "prompt" field to every example of its "train" split, which is the split
# this script evaluates on.
ds = load_dataset(path="yizhongw/self_instruct", name="human_eval")
ds_test = ds["train"].map(function=format_eval_self_inst)


def eval_rogue(model, eval_dataloader):
    """
    Evaluate model using ROUGE metrics.

    Each batch is tokenized with ``tokenize_eval_self_inst``, a continuation
    of up to 512 new tokens is sampled with ``model.generate``, and the
    decoded continuations are scored against the decoded reference outputs.

    Args:
        model: Causal language model to evaluate; generation runs on
            ``model.device``.
        eval_dataloader: Evaluation dataloader yielding batches accepted by
            ``tokenize_eval_self_inst``.

    Returns:
        ROUGE evaluation results as returned by ``rouge.compute`` (computed
        with stemming enabled).
    """
    all_preds = []
    all_labels = []

    model.eval()
    device = model.device
    print("------Now Evaluating----")

    with torch.no_grad():
        for batch_text in tqdm(eval_dataloader):
            batch = tokenize_eval_self_inst(batch_text)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=True,
                max_new_tokens=512,
            )

            # For a causal LM, `generate` returns the (padded) prompt tokens
            # followed by the new tokens. Drop the prompt part so ROUGE only
            # scores the model's response, not the echoed instruction text.
            prompt_length = input_ids.shape[1]
            completion_ids = generated_ids[:, prompt_length:]

            decoded_preds = tokenizer.batch_decode(
                completion_ids, skip_special_tokens=True
            )
            # Labels are only decoded back to text, so they can stay on CPU.
            decoded_labels = tokenizer.batch_decode(
                batch["labels"], skip_special_tokens=True
            )

            all_preds.extend(pred.strip() for pred in decoded_preds)
            all_labels.extend(label.strip() for label in decoded_labels)

    rouge_result = rouge.compute(
        predictions=all_preds,
        references=all_labels,
        use_stemmer=True,
    )
    return rouge_result


def main():
    """
    Main evaluation function.

    This function:
    1. Reads the model path from the first command-line argument
    2. Loads the model once and evaluates it with multiple random seeds
    3. Prints the per-seed results and their average across seeds

    Raises:
        SystemExit: If no model name was passed on the command line.
    """
    if len(sys.argv) < 2:
        # Exit with a usage hint instead of an opaque IndexError.
        sys.exit("Usage: python inst_tuning_self_inst.py <model_name>")
    model_name = sys.argv[1]

    # List of models to evaluate (if multiple)
    models = [model_name]  # You can extend this list for multiple models

    # The dataloader does not shuffle, so it is identical for every model and
    # seed and only needs to be built once.
    eval_dl = DataLoader(ds_test, batch_size=32)

    for model_name in models:
        print(f"Evaluating model: {model_name}")
        res_list = []

        # The weights do not depend on the seed; load them once per model
        # rather than re-reading the checkpoint for every seed.
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

        # Evaluate with multiple random seeds for robustness (generation
        # samples, so each seed produces different outputs).
        for seed in [10, 19, 42, 69, 99]:
            set_seed(seed)
            print(f"Running evaluation with seed {seed}")

            # Run evaluation
            res = eval_rogue(model, eval_dl)
            print(f"Results for seed {seed}: {res}")
            res_list.append(res)

        # Compute average results across seeds
        avg_results = pd.DataFrame(res_list).mean()
        print(f"Average results across seeds: {avg_results}")


if __name__ == "__main__":
    main()




