
from dataclasses import dataclass, field
import pyrallis
import os
from datasets import load_dataset

import warnings

from prompts import *
from language_model import LabelModel
from functools import partial
import srsly
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import numpy as np

@dataclass
class DataConfig:
    """Options for the IMDB sentiment evaluation run.

    An instance is produced by ``pyrallis.parse(config_class=DataConfig)`` in
    ``main()``.

    NOTE(review): ``response_path`` defaults to ``None`` but ``main()`` calls
    ``.split("/")`` on it, so a value must be supplied in practice.
    """
    response_path: str = None # huggingface hub path storing location of response 1
    split: str = "train" # dataset split for evaluation
    


def main():
    """Score model responses with a sentiment classifier and save the results.

    Parses a ``DataConfig`` via pyrallis, loads the dataset at
    ``cfg.response_path`` (split ``cfg.split``), and for every row feeds
    ``query + model_response`` through the
    ``siebert/sentiment-roberta-large-english`` classifier. The softmax
    probability at class index 1 (which this script reports as the positive
    sentiment score) is collected per row.

    The mean score is printed, and the mean, the per-row scores and the
    response path are written as JSON to
    ``eval_results/imdb/p1-<last component of response_path>``.

    Raises:
        ValueError: if ``response_path`` was not provided.
    """
    # Imported locally: the file's import block does not bring torch into scope.
    import torch

    warnings.filterwarnings("ignore")
    cfg = pyrallis.parse(config_class=DataConfig)
    if not cfg.response_path:
        # Fail with a clear message instead of an AttributeError on None.split.
        raise ValueError("response_path must be set to the dataset path to evaluate")
    HF_CACHE = '/workspace/rlhf-code/.cache'

    os.makedirs("eval_results/imdb", exist_ok=True)
    ovr_path = f'eval_results/imdb/p1-{cfg.response_path.split("/")[-1]}'

    ds = load_dataset(cfg.response_path, split=cfg.split, cache_dir=HF_CACHE)

    # Use the GPU when present, but keep the script runnable on CPU-only hosts.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = RobertaForSequenceClassification.from_pretrained("siebert/sentiment-roberta-large-english").to(device)
    tokenizer = RobertaTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")

    scores = []
    # Pure inference: disabling autograd avoids building a graph per forward pass.
    with torch.no_grad():
        for row in tqdm(ds, total=len(ds), desc="Evaluating IMDB responses"):
            review = row['query'] + row['model_response']
            inputs = tokenizer(review, padding=True, truncation=True, return_tensors="pt").to(model.device)
            # Batch of one -> take row 0, then the probability at class index 1.
            score = model(**inputs).logits.softmax(dim=-1)[0][1].item()
            scores.append(score)

    # Convert the NumPy scalar to a builtin float so the payload holds only
    # builtin types.
    mean_score = float(np.mean(scores))

    print(f"Mean Positive Sentiment Score: {mean_score}")

    srsly.write_json(
        ovr_path,
        {
            'mean_pos_score': mean_score,
            'resp_scores': scores,
            'response_path': cfg.response_path,
        },
    )

# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
