#!/usr/bin/env python
# coding: utf-8
"""
Evaluation Script for Natural Instructions Dataset

This script evaluates trained models on the Natural Instructions dataset
using ROUGE metrics. It supports multiple evaluation runs with different
random seeds for robust performance assessment.

Usage:
    python inst_tuning_nat.py <model_name>
    
Args:
    model_name: Path to the trained model to evaluate

"""

import torch
import os
import pandas as pd
import sys
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset, Dataset
from tqdm import tqdm
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import evaluate

# Device configuration: prefer the GPU, but fall back to the CPU so the script
# can still start on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("MiniLLM/teacher-OPT-13B")
# Batched generation with a decoder-only model continues from the last
# position of every row, so prompts must be padded on the left; with right
# padding the pad tokens end up between the prompt and the generated text.
tokenizer.padding_side = "left"

# Load ROUGE evaluation metric
rouge = evaluate.load("rouge")


def format_eval_nat(example):
    """
    Format one Natural Instructions example into the evaluation prompt.

    The raw prompt is split at its first ``Input:`` marker into an
    instruction part and an input part; an ``Output:`` section following the
    input, if present, is dropped. Both parts are stripped of surrounding
    whitespace and inserted into the instruction/input/response template.

    If the prompt contains no ``Input:`` marker, the whole (stripped) prompt
    is used as the instruction and the input section is left empty, rather
    than failing with ``AttributeError`` on the unmatched pattern.

    Args:
        example: Mapping with a ``"prompt"`` string.

    Returns:
        A dict ``{"prompt": formatted_prompt}``.
    """
    import re

    raw_prompt = example["prompt"]

    # Parse prompt to extract instruction and input
    match = re.match(
        r'^(.*?)\s*Input:\s*(.*?)(?:\s*Output:\s*(.*))?$',
        raw_prompt,
        re.DOTALL
    )

    if match is None:
        # No "Input:" marker: treat the entire prompt as the instruction.
        instruction, task_input = raw_prompt.strip(), ""
    else:
        instruction = match.group(1).strip()
        task_input = match.group(2).strip()

    # NOTE: the continuation lines of this template carry their source
    # indentation into the prompt text; it is kept unchanged on purpose so
    # the prompt stays identical to what the original script produced.
    prompt = f"""Below is an instruction that describes a task.
        Write a response that appropriately completes the request.
        ### Instruction:
        {instruction}
        ### Input:
        {task_input}
        ### Response:
        """
    return {"prompt": prompt}


def tokenize_eval_nat(example):
    """
    Tokenize the prompt and completion fields of an example or batch.

    Both fields go through the module-level tokenizer with the same
    settings: truncation at 1024 tokens, padding, and PyTorch tensors as
    output. Within this file the function is called on whole DataLoader
    batches.

    Args:
        example: Mapping with ``"prompt"`` and ``"completion"`` entries.

    Returns:
        The tokenizer output for the prompt, with the completion's
        ``input_ids`` stored under ``"labels"``.
    """
    # Identical settings for both tokenizer calls.
    shared_kwargs = {
        "truncation": True,
        "padding": True,
        "max_length": 1024,
        "return_tensors": "pt",
    }

    encoded_prompt = tokenizer(example["prompt"], **shared_kwargs)
    encoded_target = tokenizer(example["completion"], **shared_kwargs)

    encoded_prompt["labels"] = encoded_target["input_ids"]
    return encoded_prompt


# Load the Natural Instructions dataset and prepare its test split: every
# example is reformatted into the evaluation prompt, then only examples whose
# completion has a length greater than 10 are kept.
ds = load_dataset("yizhongw/self_instruct", "super_natural_instructions")
ds_test = (
    ds["test"]
    .map(format_eval_nat)
    .filter(lambda row: len(row["completion"]) > 10)
)


def eval_rogue(model, eval_dataloader):
    """
    Generate completions for every batch and score them with ROUGE.

    Each batch is tokenized with ``tokenize_eval_nat``, a completion is
    sampled with ``model.generate`` (up to 512 new tokens), and the decoded
    predictions are compared against the decoded reference completions.

    For decoder-only models ``generate`` returns the prompt tokens followed
    by the newly generated ones, so the prompt prefix is removed before
    decoding; otherwise the prompt text would be scored as part of every
    prediction.

    Args:
        model: Model to evaluate; must provide ``generate``, ``eval`` and
            ``device``.
        eval_dataloader: Iterable of batches with ``"prompt"`` and
            ``"completion"`` entries.

    Returns:
        The result of ``rouge.compute`` over all predictions and references.
    """
    all_preds = []
    all_labels = []

    model.eval()
    device = model.device
    # Encoder-decoder models return only the generated sequence, so the
    # prompt prefix is stripped for decoder-only models only.
    echoes_prompt = not getattr(model.config, "is_encoder_decoder", False)
    print("------Now Evaluating----")

    with torch.no_grad():
        for batch_text in tqdm(eval_dataloader):
            batch = tokenize_eval_nat(batch_text)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                do_sample=True,
                max_new_tokens=512
            )

            if echoes_prompt:
                # Every row of the output starts with the (padded) prompt of
                # width input_ids.shape[1]; keep only what follows it.
                generated_ids = generated_ids[:, input_ids.shape[1]:]

            decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            # Labels are only decoded, so they can stay on the CPU.
            decoded_labels = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)

            all_preds.extend(pred.strip() for pred in decoded_preds)
            all_labels.extend(label.strip() for label in decoded_labels)

    rouge_result = rouge.compute(
        predictions=all_preds,
        references=all_labels,
        use_stemmer=True
    )
    return rouge_result


def main():
    """
    Main evaluation function.

    Reads the model name or path from the first command-line argument, then:
    1. Loads the model once and moves it to the configured device
    2. Runs the ROUGE evaluation once per random seed
    3. Prints the per-seed results and their mean across seeds

    Exits with a usage message if no model name is given.
    """
    if len(sys.argv) < 2:
        program = sys.argv[0] if sys.argv else "<script>"
        sys.exit(f"Usage: python {program} <model_name>")
    model_name = sys.argv[1]

    # List of models to evaluate (if multiple)
    models = [model_name]  # You can extend this list for multiple models
    seeds = [10, 19, 42, 69, 99]

    # The dataloader does not shuffle, so one instance serves every run.
    eval_dl = DataLoader(ds_test, batch_size=16)

    for model_name in models:
        print(f"Evaluating model: {model_name}")
        res_list = []

        # Load the weights once per model; reloading them for every seed
        # repeats the same work and briefly keeps two copies in memory.
        model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

        # Evaluate with multiple random seeds for robustness
        for seed in seeds:
            # Seed right before generation so the sampling is reproducible.
            set_seed(seed)
            print(f"Running evaluation with seed {seed}")

            # Run evaluation
            res = eval_rogue(model, eval_dl)
            print(f"Results for seed {seed}: {res}")
            res_list.append(res)

        # Compute average results across seeds
        avg_results = pd.DataFrame(res_list).mean()
        print(f"Average results across seeds: {avg_results}")

        # Release this model before the next one (if any) is loaded.
        del model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


if __name__ == "__main__":
    main()




