{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from dataclasses import dataclass, field\n",
    "from typing import Optional\n",
    "import numpy as np\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "from tqdm import tqdm\n",
    "from transformers import AutoTokenizer, HfArgumentParser, pipeline\n",
    "from accelerate import Accelerator\n",
    "\n",
    "tqdm.pandas()\n",
    "\n",
    "\n",
    "\n",
    "# Load the generations to score: one JSON file per fine-tuned policy, plus the SFT baseline.\n",
    "def _load_eval_split(json_path):\n",
    "    \"\"\"Load one evaluation JSON file through `load_dataset` and return its 'train' split.\"\"\"\n",
    "    return load_dataset(\"json\", data_files=json_path, split=\"train\")\n",
    "\n",
    "\n",
    "ds_popo = _load_eval_split(\"../data/alphca_eval_popo.json\")\n",
    "ds_xpo = _load_eval_split(\"../data/alphca_eval_xpo.json\")\n",
    "ds_dpo = _load_eval_split(\"../data/alphca_eval_dpo.json\")\n",
    "\n",
    "# Reference generations that every policy is compared against.\n",
    "baseline = _load_eval_split(\"../data/alphca_eval_LLaMA3_SFT.json\")\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint of the reward model that scores every (prompt, response) pair.\n",
    "reward_model = \"Ray2333/GRM-Llama3-8B-rewardmodel-ft\"\n",
    "\n",
    "# Let accelerate pick the device the pipeline is placed on.\n",
    "accelerator = Accelerator()\n",
    "device = accelerator.device\n",
    "\n",
    "# Call-time options forwarded to the pipeline by get_reward().\n",
    "# NOTE(review): \"function_to_apply\": \"none\" is expected to return the raw model\n",
    "# score without a sigmoid/softmax -- confirm against the transformers docs.\n",
    "pipe_kwargs = dict(\n",
    "    return_all_scores=True,\n",
    "    function_to_apply=\"none\",\n",
    "    batch_size=4,\n",
    ")\n",
    "\n",
    "rm_tokenizer = AutoTokenizer.from_pretrained(reward_model)\n",
    "\n",
    "# The reward model is driven through the text-classification pipeline in bfloat16,\n",
    "# with tokenizer truncation enabled.\n",
    "rm_pipe = pipeline(\n",
    "    \"sentiment-analysis\",\n",
    "    model=reward_model,\n",
    "    tokenizer=rm_tokenizer,\n",
    "    device=device,\n",
    "    truncation=True,\n",
    "    model_kwargs={\"torch_dtype\": torch.bfloat16},\n",
    ")\n",
    "\n",
    "def get_reward(test_texts):\n",
    "    \"\"\"Score a list of chat-formatted texts with the reward-model pipeline.\n",
    "\n",
    "    Args:\n",
    "        test_texts: list of strings, passed to `rm_pipe` together with `pipe_kwargs`.\n",
    "\n",
    "    Returns:\n",
    "        A list with one value per input: the \"score\" field of the first entry\n",
    "        the pipeline returns for that input.\n",
    "    \"\"\"\n",
    "    return [scores[0][\"score\"] for scores in rm_pipe(test_texts, **pipe_kwargs)]\n",
    "\n",
    "\n",
    "def change_of_format(prom, resp):\n",
    "    \"\"\"Render a (prompt, response) pair as one chat-formatted string.\n",
    "\n",
    "    The pair becomes a user turn followed by an assistant turn, rendered with\n",
    "    the reward-model tokenizer's chat template as text (not token ids). Every\n",
    "    occurrence of the tokenizer's BOS token is then removed from the result.\n",
    "    \"\"\"\n",
    "    conversation = [\n",
    "        {\"role\": \"user\", \"content\": prom},\n",
    "        {\"role\": \"assistant\", \"content\": resp},\n",
    "    ]\n",
    "    rendered = rm_tokenizer.apply_chat_template(conversation, tokenize=False)\n",
    "    return rendered.replace(rm_tokenizer.bos_token, \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def score_dataset(dataset):\n",
    "    \"\"\"Score every sample of `dataset` with the reward model.\n",
    "\n",
    "    Each sample is read through its 'instruction' and 'output' fields. No sample\n",
    "    is skipped: every sample is formatted and scored, one pipeline call each.\n",
    "\n",
    "    Returns:\n",
    "        A list of dicts with keys \"prompt\", \"responses\" and \"rewards\", where\n",
    "        \"rewards\" is the one-element list returned by get_reward().\n",
    "    \"\"\"\n",
    "    scored = []\n",
    "    with torch.no_grad():\n",
    "        for sample in tqdm(dataset):\n",
    "            # A single conversation is sent per call, so `rewards` holds one value.\n",
    "            test_texts = [change_of_format(sample[\"instruction\"], sample[\"output\"])]\n",
    "            rewards = get_reward(test_texts)\n",
    "            scored.append({\"prompt\": sample[\"instruction\"], \"responses\": sample[\"output\"], \"rewards\": rewards})\n",
    "    return scored\n",
    "\n",
    "\n",
    "# Same scoring procedure for each fine-tuned policy.\n",
    "data_dpo = score_dataset(ds_dpo)\n",
    "data_xpo = score_dataset(ds_xpo)\n",
    "data_popo = score_dataset(ds_popo)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the SFT baseline generations; each record keeps the prompt, the response\n",
    "# and the one-element reward list returned by get_reward().\n",
    "data_bl = []\n",
    "with torch.no_grad():\n",
    "    for row in tqdm(baseline):\n",
    "        prompt, response = row[\"instruction\"], row[\"output\"]\n",
    "        scores = get_reward([change_of_format(prompt, response)])\n",
    "        data_bl.append(dict(prompt=prompt, responses=response, rewards=scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def win_flags(candidate, reference):\n",
    "    \"\"\"Compare per-sample rewards of `candidate` against `reference`.\n",
    "\n",
    "    Returns the strict (>) comparisons followed by the non-strict (>=) ones,\n",
    "    one of each per reference sample. The mean of this list is therefore\n",
    "    (P(win) + P(win or tie)) / 2, i.e. a win rate in which a tie counts as half.\n",
    "\n",
    "    Samples are paired by index over len(reference), so `candidate` must have\n",
    "    at least as many entries as `reference`.\n",
    "    \"\"\"\n",
    "    pairs = [\n",
    "        (candidate[i][\"rewards\"][0], reference[i][\"rewards\"][0])\n",
    "        for i in range(len(reference))\n",
    "    ]\n",
    "    return [cand > ref for cand, ref in pairs] + [cand >= ref for cand, ref in pairs]\n",
    "\n",
    "\n",
    "winrate_dpo = win_flags(data_dpo, data_bl)\n",
    "winrate_xpo = win_flags(data_xpo, data_bl)\n",
    "winrate_popo = win_flags(data_popo, data_bl)\n",
    "\n",
    "\n",
    "print(\"WR:\")\n",
    "print(\"DPO:\", np.mean(winrate_dpo))\n",
    "print(\"XPO:\",np.mean(winrate_xpo))\n",
    "# Fixed: this line used the undefined name `winrate_wpo` and raised NameError.\n",
    "print(\"POPO:\", np.mean(winrate_popo))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _mean_reward(scored):\n",
    "    \"\"\"Average the first reward of every record in `scored`.\"\"\"\n",
    "    return np.mean([entry[\"rewards\"][0] for entry in scored])\n",
    "\n",
    "\n",
    "# Mean reward-model score of each policy over all of its scored samples.\n",
    "print(\"AvgV:\")\n",
    "print(\"DPO:\", _mean_reward(data_dpo))\n",
    "print(\"XPO:\", _mean_reward(data_xpo))\n",
    "print(\"POPO:\", _mean_reward(data_popo))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "vllm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
