#!/usr/bin/env python
# coding: utf-8
"""
Evaluation Script for Unnatural Instructions Dataset

This script evaluates trained models on the Unnatural Instructions dataset
using ROUGE metrics. It supports multiple evaluation runs with different
random seeds for robust performance assessment.

Usage:
    python inst_tuning_unnat.py <model_name>
    
Args:
    model_name: Path to the trained model to evaluate
"""

import torch
import os
import pandas as pd
import sys
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from tqdm import tqdm
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import evaluate

# Device configuration: prefer the GPU, but fall back to the CPU so that
# `.to(device)` still works on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("MiniLLM/teacher-OPT-13B")
# The tokenizer is used to build padded batches that are fed to `generate()`
# on a causal LM. Generation continues from the last position of every row,
# so padding must sit on the left; otherwise shorter prompts would be
# continued after a run of pad tokens. Padding added to the reference labels
# is stripped again when they are decoded with skip_special_tokens=True.
tokenizer.padding_side = "left"

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")


def format_eval_unnat(example):
    """
    Build the evaluation prompt for one Unnatural Instructions record.

    Only the first entry of ``example["instances"]`` is used.

    Args:
        example: Dataset record with an "instruction" field and an
            "instances" list whose entries carry "input", "constraints"
            and "output"

    Returns:
        Dict with the rendered "prompt" and the reference "output"
    """
    first_instance = example["instances"][0]

    # Every line after the first one is indented by eight spaces, and the
    # prompt ends with a newline followed by that same indent. This exact
    # layout is part of the prompt text and must not be normalised here.
    line_break = "\n" + " " * 8
    segments = (
        "Below is an instruction that describes a task.",
        "Write a response that appropriately completes the request.",
        "### Instruction:",
        f"{example['instruction']}",
        "### Input:",
        f"{first_instance['input']}",
        f"{first_instance['constraints']}",
        "### Response:",
        "",  # yields the trailing newline + indent after "### Response:"
    )

    return {
        "prompt": line_break.join(segments),
        "output": first_instance["output"],
    }


def tokenize_eval_unnat(example):
    """
    Tokenize the prompt and the reference output of a formatted example.

    Both texts go through the module-level tokenizer with the same settings:
    truncation to 1024 tokens, padding, and PyTorch tensors as output.

    Args:
        example: Mapping with a "prompt" entry and an "output" entry

    Returns:
        The tokenizer result for the prompt, with an additional "labels"
        entry holding the input ids of the tokenized output
    """
    # Identical options are applied to the prompt and to the reference text.
    shared_options = dict(
        truncation=True,
        padding=True,
        max_length=1024,
        return_tensors="pt",
    )

    encoded_prompt = tokenizer(example["prompt"], **shared_options)
    reference_ids = tokenizer(example["output"], **shared_options)["input_ids"]
    encoded_prompt["labels"] = reference_ids
    return encoded_prompt


# Load Unnatural Instructions dataset and derive the evaluation pool from its
# "train" split: render each record into a prompt/output pair, drop the
# original columns, then keep only references longer than 10 characters.
ds = load_dataset("mrm8488/unnatural-instructions-full")
ds_test = ds["train"].map(format_eval_unnat)
ds_test = ds_test.remove_columns(ds["train"].column_names)
ds_test = ds_test.filter(lambda record: len(record["output"]) > 10)


def eval_rogue(model, eval_dataloader):
    """
    Evaluate model using ROUGE metrics.

    For every batch, the prompts are tokenized, a sampled continuation of up
    to 512 new tokens is generated, and the decoded continuation is collected
    together with the decoded reference output. ROUGE is computed once over
    all collected pairs.

    Args:
        model: Model to evaluate; generation runs on ``model.device``
        eval_dataloader: Iterable of batches providing "prompt" and "output"
            entries, as consumed by ``tokenize_eval_unnat``

    Returns:
        ROUGE evaluation results as returned by ``rouge.compute``
    """
    all_preds = []
    all_labels = []

    model.eval()
    device = model.device
    # For decoder-only models `generate()` returns the prompt followed by the
    # continuation; encoder-decoder models return only the continuation.
    strip_prompt = not getattr(model.config, "is_encoder_decoder", False)
    print("------Now Evaluating----")

    with torch.no_grad():
        for batch_text in tqdm(eval_dataloader):
            batch = tokenize_eval_unnat(batch_text)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=True,
                max_new_tokens=512
            )

            if strip_prompt:
                # Score only the newly generated tokens. Leaving the prompt in
                # the prediction would make ROUGE compare the instruction text
                # against the reference answer. Every row of the batch shares
                # the same padded prompt length, so one column slice suffices.
                generated_ids = generated_ids[:, input_ids.shape[1]:]

            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            # The labels are only decoded back to text, so they stay on the CPU.
            decoded_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

            all_preds.extend(pred.strip() for pred in decoded_preds)
            all_labels.extend(label.strip() for label in decoded_labels)

    rouge_result = rouge.compute(
        predictions=all_preds,
        references=all_labels,
        use_stemmer=True
    )
    return rouge_result


def main():
    """
    Main evaluation function.

    This function:
    1. Reads the model path from the first command-line argument
    2. Loads the specified model once
    3. Runs evaluation with multiple random seeds, each on its own shuffled
       subset of at most 5000 examples from ``ds_test``
    4. Prints the per-seed results and their average across seeds

    Exits with a usage message when no model path is given.
    """
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python {sys.argv[0]} <model_name>")

    # List of models to evaluate (if multiple)
    models = [sys.argv[1]]  # You can extend this list for multiple models

    # Never request more examples than survived the dataset filter;
    # `select` raises on out-of-range indices.
    eval_size = min(5000, len(ds_test))

    for model_name in models:
        print(f"Evaluating model: {model_name}")
        res_list = []

        # Load the model once per model rather than once per seed: the weights
        # do not depend on the seed, and reloading while the previous copy is
        # still referenced briefly keeps two copies in device memory.
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

        # Evaluate with multiple random seeds for robustness
        for seed in [10, 19, 42, 69, 99]:
            # Seeds both the subset selection below and the sampling done
            # during generation.
            set_seed(seed)
            print(f"Running evaluation with seed {seed}")

            # Create subset of dataset for evaluation
            ds_subset = ds_test.shuffle(seed=seed).select(range(eval_size))
            eval_dl = DataLoader(ds_subset, batch_size=32)

            # Run evaluation
            res = eval_rogue(model, eval_dl)
            print(f"Results for seed {seed}: {res}")
            res_list.append(res)

        # Compute average results across seeds
        avg_results = pd.DataFrame(res_list).mean()
        print(f"Average results across seeds: {avg_results}")

        # Drop the reference so the next model (if any) does not have to
        # coexist with this one in memory.
        del model


if __name__ == "__main__":
    main()




