# -*- coding: utf-8 -*-
"""MLS - Eval - English.ipynb

Automatically generated by Colab.

# Overview

This notebook:
1. Imports the SORRY-Bench evaluation dataset
2. Generates outputs from a target model using the SORRY-Bench dataset as input
3. Runs the SORRY-Bench autorater (Mistral) on the generated outputs
4. Saves the results (Prompt, Output, Rating) to a newly created Google Sheet

## Variables to Update
"""

# Both names must be given a value before anything else runs: as written the
# assignments have no right-hand side, so this cell does not parse.
# MODEL_PATH is passed to the generation script as --model-path; MODEL_ID is
# passed as --model-id and also names the answer file
# (data/sorry_bench/model_answer/<MODEL_ID>.jsonl) and the Google Sheets.
MODEL_PATH = # Insert model path for evaluation here
MODEL_ID = # Insert model ID

"""## Memory reset"""

import shutil
import os
import gc
import torch

# Best-effort release of GPU and Python memory left over from a previous run.
# Any failure is reported as a warning and never stops the notebook.
try:
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        # Hand cached allocator blocks back to the driver, then reclaim
        # CUDA IPC memory.
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
    gc.collect()
    print("VRAM & RAM explicit garbage collection triggered.")
except Exception as exc:
    print(f"VRAM clear warning: {exc}")

import subprocess

# Cache and scratch locations that are wiped before every run so the next
# model is downloaded and compiled from a clean state.
cleanup_targets = [
    # Hugging Face caches (several spellings of what may be the same path).
    "/content/.cache/huggingface",
    os.path.expanduser("~/.cache/huggingface"),
    "/root/.cache/huggingface",

    # Per-user caches; presumably Triton / PyTorch / NVIDIA compiler caches.
    os.path.expanduser("~/.triton"),
    os.path.expanduser("~/.cache/torch"),
    os.path.expanduser("~/.nv"),

    # Scratch directories and shared memory.
    "/tmp/vllm",
    "/tmp/ray",
    "/dev/shm",
]


def _force_delete(path):
    """Delete ``path`` (file, symlink or directory tree).

    Tries the Python standard library first and falls back to ``rm -rf`` when
    that fails. The fallback receives the path as a separate argv entry, so
    names containing spaces or shell metacharacters are handled safely.
    """
    try:
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path)
        else:
            # rmtree refuses plain files and symlinks; unlink them directly.
            os.remove(path)
    except OSError as err:
        print(f"   Python delete failed, forcing system delete on {path}")
        print(f"   Reason: {err}")
        subprocess.run(["rm", "-rf", "--", path], check=False)


for target in cleanup_targets:
    if not os.path.exists(target):
        continue
    print(f"Cleaning {target}...")
    if os.path.ismount(target):
        # A mount point (typically /dev/shm) cannot itself be removed: both
        # rmtree and `rm -rf` would empty it and then fail on the directory.
        # Empty it instead and leave the mount in place.
        for child in os.listdir(target):
            _force_delete(os.path.join(target, child))
    else:
        _force_delete(target)

# Remove only the Git LFS object store inside the autorater checkout; the
# rest of the checkout directory is left in place.
# NOTE(review): presumably .git/lfs holds a second copy of the weights that
# are already checked out in the working tree - confirm before relying on it.
autorater_lfs = "/content/sorry-bench/ckpts/finetuned_models/ft-mistral-7b-instruct-v0.2-sorry-bench-202406/.git/lfs"
if os.path.exists(autorater_lfs):
    print("Removing autorater Git LFS cache...")
    shutil.rmtree(autorater_lfs)

# Directories holding the previous run's generated answers and judgments.
# Their contents are deleted; the directories themselves are kept.
output_dirs = [
    "/content/sorry-bench/data/sorry_bench/model_answer",
    "/content/sorry-bench/data/sorry_bench/model_judgment"
]
for dir_path in output_dirs:
    # Skip anything that is missing or is not a directory.
    if not os.path.isdir(dir_path):
        continue
    print(f"🧹 Sweeping output directory: {dir_path}")
    for entry_name in os.listdir(dir_path):
        entry_path = os.path.join(dir_path, entry_name)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            # os.remove() raises on directories, which would abort the sweep.
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)

# Final status banner for the memory-reset step, one message per line.
print(
    "\n Autorater Preserved.",
    " COMPILER & WEIGHT CACHES DESTROYED.",
    f" Ready for fresh download of: {MODEL_PATH}",
    sep="\n",
)

"""## Set up"""

# Commented out IPython magic to ensure Python compatibility.
# NOTE: the `# %cd ...` lines below were live directory-change magics in the
# notebook and were commented out by the export. When running in Colab they
# must be active, because the commands that follow each one depend on the
# working directory (`pip install -e` installs the project in the current
# directory).
# Step 1: fetch FastChat and install it in editable mode with the extras the
# benchmark scripts need, plus vLLM.
!git clone https://github.com/lm-sys/FastChat.git
# %cd FastChat
!pip install -e ".[model_worker,llm_judge]"
!pip install vllm
# %cd ..
# Step 2: fetch the SORRY-Bench repository and the Google Sheets client
# libraries used to export results.
!git clone https://github.com/sorry-bench/sorry-bench.git
# %cd sorry-bench
!pip install gspread gspread-dataframe google-auth-oauthlib -q

import pandas as pd
import json
import os
from datetime import datetime
import google.auth
from google.colab import auth, drive, userdata
from google.auth import default
import gspread
from gspread_dataframe import set_with_dataframe


# Authenticate the Colab user, build a gspread client from the default Google
# credentials, and mount Google Drive at /content/drive.
print("Authenticating user and mounting Drive...")
auth.authenticate_user()
creds, _ = default()
# NOTE(review): this rebinds the name `gc`, which the memory-reset section
# imports as the standard-library garbage-collector module. From here on `gc`
# is the gspread client, and re-running the memory-reset cell rebinds it to
# the module again - a distinct name would avoid the clash.
gc = gspread.authorize(creds)
drive.mount('/content/drive', force_remount=True)
print("Authentication and Drive mount successful.")

"""## Load dataset"""

# Commented out IPython magic to ensure Python compatibility.
# Read the Hugging Face access token from Colab Secrets. The secret is stored
# under the name 'HF_KEY'; a missing secret is reported as a ValueError that
# names that exact key and keeps the original error as its cause.
try:
    hf_token = userdata.get('HF_KEY')
except userdata.SecretNotFoundError as err:
    raise ValueError("Hugging Face token not found. Please add it to Colab Secrets with the name 'HF_KEY'.") from err

# Clone the gated SORRY-Bench dataset and unpack it into data/sorry_bench.
# The `# %cd` lines were live directory changes in the notebook; the relative
# paths below only resolve correctly when they are active.
# %cd /content/sorry-bench
!mkdir -p data/sorry_bench
# %cd data/sorry_bench
# NOTE(review): the token is embedded in the clone URL; make sure the cell
# output is not shared if git echoes the URL in an error message.
!git clone https://hf:{hf_token}@huggingface.co/datasets/sorry-bench/sorry-bench-202503
# The `*` glob skips dot-entries, so the clone's .git directory stays behind
# and is deleted together with the now-empty folder on the next line.
!mv sorry-bench-202503/* .
!rm -r sorry-bench-202503
# %cd /content/

# Printed unconditionally: the shell commands above are not checked for
# failure.
print("Dataset downloaded successfully!")

"""## Generate Outputs"""

# Commented out IPython magic to ensure Python compatibility.
# %cd /content/sorry-bench

# Discard local modifications to the generation script so the patch step that
# follows always starts from the committed version.
!git checkout gen_model_answer_vllm.py
# Printed unconditionally: the exit status of the checkout is not inspected.
print("✅ File reset to original state.")

# Patch the upstream generation script in place:
#   1. cap the vLLM context window at 8192 tokens, and
#   2. add stop sequences to the sampling parameters.
# Each patch is a plain text substitution keyed on an anchor string. If the
# anchor is missing, nothing is changed and a warning is printed instead of a
# false success message.
file_path = 'gen_model_answer_vllm.py'
with open(file_path, 'r', encoding='utf-8') as f:
    content = f.read()

if "max_model_len=8192" in content:
    print("Context window limit already present.")
elif "model = LLM(" in content:
    content = content.replace(
        "model = LLM(",
        "model = LLM(max_model_len=8192, "
    )
    print("Context window limited to 8192 tokens.")
else:
    print("WARNING: 'model = LLM(' not found in script; context window limit NOT applied.")

if 'stop=[' in content:
    print("ℹStop tokens already present.")
elif "SamplingParams(" in content:
    # Every SamplingParams(...) call in the script receives the stop list.
    content = content.replace(
        "SamplingParams(",
        "SamplingParams(stop=[\"### Human:\", \"### Assistant:\", \"<|eot_id|>\"], "
    )
    print("Stop tokens added.")
else:
    print("WARNING: 'SamplingParams(' not found in script; stop tokens NOT added.")

with open(file_path, 'w', encoding='utf-8') as f:
    f.write(content)

cmd = (
    f"python gen_model_answer_vllm.py "
    f"--bench-name sorry_bench "
    f"--model-path {MODEL_PATH} "
    f"--model-id {MODEL_ID} "
    f"--dtype bfloat16"
)

print(f"Running command:\n{cmd}")
!{cmd}

"""## Save Outputs to Drive"""

# Upload the raw generation output (one JSON record per line) to a new Google
# Sheet named after the model. Failures are reported but do not stop the
# notebook, so the autorater can still run on the local file.
output_file_path = f'/content/sorry-bench/data/sorry_bench/model_answer/{MODEL_ID}.jsonl'
google_sheet_name = f'{MODEL_ID} - SORRY-Bench Outputs'

print(f"Attempting to load file from: {output_file_path}")

try:
    df = pd.read_json(output_file_path, lines=True)
    print("\nSuccessfully loaded the output file into a DataFrame.")

    # `gc` is the gspread client created during set-up.
    sh = gc.create(google_sheet_name)
    worksheet = sh.get_worksheet(0)

    set_with_dataframe(worksheet, df)

    print(f"\n✅ Successfully saved data to Google Sheet!")
    print(f"🔗 Link: {sh.url}")

except FileNotFoundError:
    print(f"Error: Output file not found at '{output_file_path}'")
    print("\nDebugging tips:")
    print("1. Verify the generation script ran without errors.")
    print("2. Check the actual file path by running the following command in a new cell:")
    # Derived from output_file_path so the hint always points at the
    # directory that was actually searched.
    print(f"!ls -l {os.path.dirname(output_file_path)}/")
except Exception as e:
    # Deliberate catch-all at the cell boundary: report and carry on.
    print(f"An unexpected error occurred: {e}")

"""## Run Autorater"""

# Commented out IPython magic to ensure Python compatibility.
# Make sure the fine-tuned Mistral autorater checkpoint is available locally.
project_root = "/content/sorry-bench"
# %cd {project_root}

# The Hugging Face token is read from the Colab secret named 'HF_KEY'.
try:
    hf_token = userdata.get('HF_KEY')
except userdata.SecretNotFoundError:
    raise ValueError("Hugging Face token 'HF_KEY' not found in Colab Secrets.")

model_dir = os.path.join(project_root, "ckpts/finetuned_models/ft-mistral-7b-instruct-v0.2-sorry-bench-202406")
# NOTE(review): the token is part of the clone URL, so git will presumably
# record it as the remote URL inside the clone - confirm this is acceptable.
model_repo_url = f"https://hf:{hf_token}@huggingface.co/sorry-bench/ft-mistral-7b-instruct-v0.2-sorry-bench-202406"

# Only the existence of the directory is checked, so an interrupted earlier
# download is treated as complete and skipped.
if not os.path.exists(model_dir):
    print(f"Cloning autorater model into {model_dir}...")
    # Clone with LFS smudging disabled first, then pull the LFS objects in a
    # separate step.
    !GIT_LFS_SKIP_SMUDGE=1 git clone {model_repo_url} {model_dir}
    !cd {model_dir} && git lfs pull
else:
    print("Autorater model directory already exists. Skipping download.")

print("\nRunning the autorater judgment script...")

judgment_command = f"python gen_judgment_safety_vllm.py --model-list {MODEL_ID}"
print(f"Running command: {judgment_command}")
!{judgment_command}

"""## Save outputs"""

# Create a fresh, timestamped Google Sheet for the merged results.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
sheet_name = f"{MODEL_ID}_SORRY-Bench_Results_{timestamp}"

try:
    sh = gc.create(sheet_name)
except Exception as e:
    print(f"Error creating sheet (you may have hit a limit): {e}")
    # Stop here. Continuing would leave `sh` undefined or still bound to a
    # sheet created earlier in the notebook, and the results would then be
    # written into the wrong spreadsheet.
    raise
else:
    print(f"Created new Google Sheet: '{sheet_name}'")
    print(f"URL: {sh.url}")

# Load the benchmark questions, the target model's answers and the autorater
# judgments, each stored as JSON Lines. The paths are relative, so the working
# directory must be the sorry-bench project root.
questions_df = pd.read_json('data/sorry_bench/question.jsonl', lines=True)
answers_df = pd.read_json(f'data/sorry_bench/model_answer/{MODEL_ID}.jsonl', lines=True)
# The judgment file name is fixed and does not include MODEL_ID.
# NOTE(review): presumably this is the file the judgment script writes for
# the Mistral autorater - confirm against gen_judgment_safety_vllm.py.
judgments_df = pd.read_json('data/sorry_bench/model_judgment/ft-mistral-7b-instruct-v0.2.jsonl', lines=True)

def extract_output(choices):
    """Return the first turn of the first choice, or "" when unavailable.

    ``choices`` is expected to be a sequence whose first element is a mapping
    with a ``'turns'`` sequence. Any other shape (empty list, missing key,
    non-subscriptable value) yields an empty string instead of raising.
    """
    try:
        first_choice = choices[0]
        turns = first_choice['turns']
        first_turn = turns[0]
    except (IndexError, KeyError, TypeError):
        # Failed or malformed generation: fall back to an empty output.
        return ""
    else:
        return first_turn

# Reduce every answer record to the text of its first turn.
answers_df['Output'] = answers_df['choices'].apply(extract_output)

# rename() ignores labels that are absent, so no membership test is needed.
judgments_df = judgments_df.rename(columns={'score': 'Rating'})

# Keep only the columns that end up in the sheet.
questions_df = questions_df[['question_id', 'category', 'turns']].rename(columns={'turns': 'Prompt'})
answers_df = answers_df[['question_id', 'Output']]
judgments_df = judgments_df[['question_id', 'Rating']]

# Inner joins: a question survives only if it has both an answer and a rating.
merged_df = (
    questions_df
    .merge(answers_df, on='question_id', how='inner')
    .merge(judgments_df, on='question_id', how='inner')
)

dropped = len(questions_df) - len(merged_df)
if dropped:
    print(f"WARNING: Dropped {dropped} rows due to generation failures.")
else:
    print(f"Check passed: {len(merged_df)} records matched perfectly.")

# Prompts stored as a list of turns are flattened to their first turn.
merged_df['Prompt'] = merged_df['Prompt'].apply(
    lambda turns: turns[0] if isinstance(turns, list) else turns
)

# Write the merged table to the first worksheet of the results sheet.
worksheet = sh.get_worksheet(0)
set_with_dataframe(worksheet, merged_df)
print("Results successfully saved.")