{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload imported modules automatically before each cell runs, so edits to local source files take effect without restarting the kernel.\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "from datasets import load_dataset\n",
    "\n",
    "import sys\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import ast\n",
    "\n",
    "import json\n",
    "\n",
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "from scipy.stats import pearsonr, spearmanr\n",
    "\n",
    "import numpy as np\n",
    "import os\n",
    "sys.path.append(\"..\")\n",
    "from weaver.constants import (\n",
    "    DATASET_TO_REWARD_MODELS, \n",
    "    DATASET_TO_LM_JUDGES,\n",
    "    VERIFIER_NAME_MAP,\n",
    "    VERIFIER_DESCRIPTIONS,\n",
    "    DATASET_TO_HF\n",
    ")\n",
    "# Add the project root to sys.path\n",
    "sys.path.append(\"..\")\n",
    "\n",
    "# Now the import should work\n",
    "from weaver.dataset import VerificationDataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Inspect reward model distributions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Benchmarks analysed in this notebook: every DATASET_TO_HF key whose name does not contain \"CodeContests\".\n",
    "dataset_names = [name for name in DATASET_TO_HF if \"CodeContests\" not in name]\n",
    "\n",
    "# Values stored under the '8B' / '70B' keys of each kept entry, in the same order as dataset_names.\n",
    "datasets_8b = [DATASET_TO_HF[name]['8B'] for name in dataset_names]\n",
    "datasets_70b = [DATASET_TO_HF[name]['70B'] for name in dataset_names]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verifier name lists are looked up once per model size, keyed by the first element of each dataset list.\n",
    "# NOTE(review): the key is the value stored under '8B'/'70B' in DATASET_TO_HF, not the dataset name, and only the\n",
    "# first dataset is consulted - confirm both mappings are keyed that way and that all datasets share these lists.\n",
    "_first_8b = datasets_8b[0]\n",
    "_first_70b = datasets_70b[0]\n",
    "\n",
    "reward_models_8b, reward_models_70b = DATASET_TO_REWARD_MODELS[_first_8b], DATASET_TO_REWARD_MODELS[_first_70b]\n",
    "lm_judges_8b, lm_judges_70b = DATASET_TO_LM_JUDGES[_first_8b], DATASET_TO_LM_JUDGES[_first_70b]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline keyword arguments passed to VerificationDataset(**...); later cells override\n",
    "# 'dataset_name' and 'model_size' per dataset.\n",
    "base_cfg = dict(\n",
    "    model_size='8B',\n",
    "    random_seed=0,\n",
    "    train_split=1,\n",
    "    dataset_name='AIMO',\n",
    "    verifier_cfg=dict(verifier_size=80, verifier_type='all', verifier_subset=None),\n",
    "    train_queries=1,\n",
    "    train_samples=1,\n",
    "    mv_as_verifier=True,\n",
    "    normalize_type='all_problems',\n",
    "    nan_replacement=0,\n",
    "    same_train_test=False,\n",
    "    fixed_test_split=None,\n",
    "    normalize_method='minmax',\n",
    "    normalize_params=dict(tmp=None),\n",
    "    reward_threshold=None,\n",
    "    closest_train_problem_method='mean_verifier_distance',\n",
    "    closest_train_problem_metric_type='euclidean',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8B datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "# Scores per verifier (one array appended per dataset) and the matching per-dataset correctness labels.\n",
    "reward_model_scores = defaultdict(list)\n",
    "all_labels = []\n",
    "for d in dataset_names:\n",
    "    print(d)\n",
    "\n",
    "    hf_path = DATASET_TO_HF[d]['8B']\n",
    "    dataset = load_dataset(hf_path)['data']\n",
    "\n",
    "    # Work on a per-dataset copy instead of mutating base_cfg, and pin model_size explicitly so the\n",
    "    # result does not depend on what other cells have done to base_cfg before this one is (re-)run.\n",
    "    cfg = {**base_cfg, 'dataset_name': d, 'model_size': '8B'}\n",
    "    verifier_ds = VerificationDataset(**cfg)\n",
    "\n",
    "    for rm in reward_models_8b + ['mv_verifier']:\n",
    "        # Names containing \"_step\" are not collected.\n",
    "        if \"_step\" in rm:\n",
    "            continue\n",
    "\n",
    "        if rm == 'mv_verifier':\n",
    "            # Not read from the HF dataset: last index along the third axis of test_data[0].\n",
    "            # NOTE(review): presumably the majority-vote column enabled by mv_as_verifier=True - confirm.\n",
    "            reward_model_scores[rm].append(verifier_ds.test_data[0][:, :, -1])\n",
    "        else:\n",
    "            reward_model_scores[rm].append(np.array(dataset[rm]))\n",
    "\n",
    "    all_labels.append(np.array(dataset['answer_correct']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_cols = 5\n",
    "# Keep the original 4x5 layout, adding rows only if there are more verifiers than panels (ceil division).\n",
    "n_rows = max(4, -(-len(reward_model_scores) // n_cols))\n",
    "fig, ax = plt.subplots(n_rows, n_cols, figsize=(25, 15 * n_rows / 4))\n",
    "\n",
    "# One label per generation, concatenated over datasets in the order the scores were collected.\n",
    "# Cast to bool so the labels act as a mask below (0/1 integers would be treated as positions instead).\n",
    "flattened_labels = np.array([generation_label for dataset_labels in all_labels for problem_labels in dataset_labels for generation_label in problem_labels]).astype(bool)\n",
    "\n",
    "for i, (rm, scores) in enumerate(reward_model_scores.items()):\n",
    "    panel = ax[i // n_cols, i % n_cols]\n",
    "\n",
    "    flattened_scores = np.array([generation_score for data_score in scores for problem_score in data_score for generation_score in problem_score]).astype(float)\n",
    "    # Scores and labels must line up one-to-one before NaNs are dropped.\n",
    "    assert flattened_scores.shape == flattened_labels.shape, (rm, flattened_scores.shape, flattened_labels.shape)\n",
    "\n",
    "    # Drop generations without a score, keeping the labels aligned with what remains.\n",
    "    nan_idxs = np.isnan(flattened_scores)\n",
    "    flattened_scores = flattened_scores[~nan_idxs]\n",
    "    flattened_labels_no_nans = flattened_labels[~nan_idxs]\n",
    "\n",
    "    flattened_scores_correct = flattened_scores[flattened_labels_no_nans]\n",
    "    flattened_scores_incorrect = flattened_scores[~flattened_labels_no_nans]\n",
    "\n",
    "    # Shared bin edges so the two histograms are directly comparable.\n",
    "    bins = np.histogram_bin_edges(flattened_scores, bins=20)\n",
    "\n",
    "    panel.hist(flattened_scores_correct, bins=bins, alpha=0.5, label=\"Correct\")\n",
    "    panel.hist(flattened_scores_incorrect, bins=bins, alpha=0.5, label=\"Incorrect\")\n",
    "    panel.set_title(rm)\n",
    "    panel.set_yticklabels([])\n",
    "    panel.legend()\n",
    "\n",
    "# Hide the panels that did not receive a verifier.\n",
    "for unused in ax.flat[len(reward_model_scores):]:\n",
    "    unused.set_visible(False)\n",
    "\n",
    "fig.suptitle(\"Reward model output distribution across 8B datasets\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Individual 8B datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "n_cols = 5\n",
    "# Verifiers plotted for every dataset; names containing \"_step\" are skipped.\n",
    "verifiers_8b = [rm for rm in reward_models_8b + ['mv_verifier'] if \"_step\" not in rm]\n",
    "# Keep the original 4x5 layout, adding rows only if there are more verifiers than panels (ceil division).\n",
    "n_rows = max(4, -(-len(verifiers_8b) // n_cols))\n",
    "\n",
    "for d in dataset_names:\n",
    "    fig, ax = plt.subplots(n_rows, n_cols, figsize=(25, 15 * n_rows / 4))\n",
    "    print(d)\n",
    "\n",
    "    hf_name = DATASET_TO_HF[d]['8B']\n",
    "    dataset = load_dataset(hf_name)['data']\n",
    "    # Stored under its own name so the per-dataset list `all_labels` used by the aggregate plot is not overwritten.\n",
    "    # Cast to bool so the labels act as a mask below (0/1 integers would be treated as positions instead).\n",
    "    labels = np.array(dataset['answer_correct']).flatten().astype(bool)\n",
    "\n",
    "    # Per-dataset copy with model_size pinned, so the result does not depend on the current state of base_cfg.\n",
    "    cfg = {**base_cfg, 'dataset_name': d, 'model_size': '8B'}\n",
    "    verifier_ds = VerificationDataset(**cfg)\n",
    "\n",
    "    for panel, rm in zip(ax.flat, verifiers_8b):\n",
    "        if rm == 'mv_verifier':\n",
    "            # Not read from the HF dataset: last index along the third axis of test_data[0].\n",
    "            scores = verifier_ds.test_data[0][:, :, -1].flatten()\n",
    "        else:\n",
    "            scores = np.array(dataset[rm]).flatten().astype(float)\n",
    "\n",
    "        # Drop generations without a score, keeping the labels aligned with what remains.\n",
    "        nan_idxs = np.isnan(scores)\n",
    "        scores = scores[~nan_idxs]\n",
    "        labels_no_nans = labels[~nan_idxs]\n",
    "\n",
    "        # Shared bin edges so the two histograms are directly comparable.\n",
    "        bins = np.histogram_bin_edges(scores, bins=20)\n",
    "\n",
    "        panel.hist(scores[labels_no_nans], bins=bins, alpha=0.5, label=\"Correct\")\n",
    "        panel.hist(scores[~labels_no_nans], bins=bins, alpha=0.5, label=\"Incorrect\")\n",
    "        panel.set_title(rm)\n",
    "        panel.set_yticklabels([])\n",
    "        panel.legend()\n",
    "\n",
    "    # Hide the panels that did not receive a verifier.\n",
    "    for unused in ax.flat[len(verifiers_8b):]:\n",
    "        unused.set_visible(False)\n",
    "\n",
    "    fig.suptitle(d, fontsize=18)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 70B datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "# Scores per verifier (one array appended per dataset) and the matching per-dataset correctness labels.\n",
    "verifier_scores = defaultdict(list)\n",
    "all_labels = []\n",
    "for d in dataset_names:\n",
    "    print(d)\n",
    "\n",
    "    hf_path = DATASET_TO_HF[d]['70B']\n",
    "    dataset = load_dataset(hf_path)['data']\n",
    "\n",
    "    # Build a per-dataset copy rather than writing 'dataset_name'/'model_size' into the shared base_cfg,\n",
    "    # so running this cell does not change what cells that read base_cfg see afterwards.\n",
    "    cfg = {**base_cfg, 'dataset_name': d, 'model_size': '70B'}\n",
    "    verifier_ds = VerificationDataset(**cfg)\n",
    "\n",
    "    for v in reward_models_70b + lm_judges_70b + ['mv_verifier']:\n",
    "        # Names containing \"_step\", \"Claude\" or \"GPT\" are not collected.\n",
    "        if \"_step\" in v or \"Claude\" in v or \"GPT\" in v:\n",
    "            continue\n",
    "\n",
    "        if v == 'mv_verifier':\n",
    "            # Not read from the HF dataset: last index along the third axis of test_data[0].\n",
    "            # NOTE(review): presumably the majority-vote column enabled by mv_as_verifier=True - confirm.\n",
    "            verifier_scores[v].append(verifier_ds.test_data[0][:, :, -1])\n",
    "        else:\n",
    "            # squeeze() removes size-1 axes from the stored score array.\n",
    "            verifier_scores[v].append(np.array(dataset[v]).squeeze())\n",
    "\n",
    "    all_labels.append(np.array(dataset['answer_correct']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_cols = 6\n",
    "# Keep the original 6x6 layout, adding rows only if there are more verifiers than panels (ceil division).\n",
    "n_rows = max(6, -(-len(verifier_scores) // n_cols))\n",
    "fig, ax = plt.subplots(n_rows, n_cols, figsize=(40, 25 * n_rows / 6))\n",
    "\n",
    "# One label per generation, concatenated over datasets in the order the scores were collected.\n",
    "# Cast to bool so the labels act as a mask below (0/1 integers would be treated as positions instead).\n",
    "flattened_labels = np.array([generation_label for dataset_labels in all_labels for problem_labels in dataset_labels for generation_label in problem_labels]).astype(bool)\n",
    "\n",
    "for i, (v, scores) in enumerate(verifier_scores.items()):\n",
    "    panel = ax[i // n_cols, i % n_cols]\n",
    "\n",
    "    flattened_scores = np.array([generation_score for data_score in scores for problem_score in data_score for generation_score in problem_score]).astype(float)\n",
    "    # Scores and labels must line up one-to-one before NaNs are dropped.\n",
    "    assert flattened_scores.shape == flattened_labels.shape, (v, flattened_scores.shape, flattened_labels.shape)\n",
    "\n",
    "    # Drop generations without a score, keeping the labels aligned with what remains.\n",
    "    nan_idxs = np.isnan(flattened_scores)\n",
    "    flattened_scores = flattened_scores[~nan_idxs]\n",
    "    flattened_labels_no_nans = flattened_labels[~nan_idxs]\n",
    "\n",
    "    flattened_scores_correct = flattened_scores[flattened_labels_no_nans]\n",
    "    flattened_scores_incorrect = flattened_scores[~flattened_labels_no_nans]\n",
    "\n",
    "    # Shared bin edges so the two histograms are directly comparable.\n",
    "    bins = np.histogram_bin_edges(flattened_scores, bins=20)\n",
    "\n",
    "    panel.hist(flattened_scores_correct, bins=bins, alpha=0.5, label=\"Correct\")\n",
    "    panel.hist(flattened_scores_incorrect, bins=bins, alpha=0.5, label=\"Incorrect\")\n",
    "    panel.set_title(v)\n",
    "    panel.set_yticklabels([])\n",
    "    panel.legend()\n",
    "\n",
    "# Hide the panels that did not receive a verifier.\n",
    "for unused in ax.flat[len(verifier_scores):]:\n",
    "    unused.set_visible(False)\n",
    "\n",
    "fig.suptitle(\"Verifier output distribution across 70B datasets\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Individual 70B datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "n_cols = 6\n",
    "# Verifiers plotted for every dataset; names containing \"_step\", \"Claude\" or \"GPT\" are skipped.\n",
    "verifiers_70b = [\n",
    "    v for v in reward_models_70b + lm_judges_70b + ['mv_verifier']\n",
    "    if \"_step\" not in v and \"Claude\" not in v and \"GPT\" not in v\n",
    "]\n",
    "# Keep the original 6x6 layout, adding rows only if there are more verifiers than panels (ceil division).\n",
    "n_rows = max(6, -(-len(verifiers_70b) // n_cols))\n",
    "\n",
    "for d in dataset_names:\n",
    "    fig, ax = plt.subplots(n_rows, n_cols, figsize=(40, 25 * n_rows / 6))\n",
    "    print(d)\n",
    "\n",
    "    hf_path = DATASET_TO_HF[d]['70B']\n",
    "    dataset = load_dataset(hf_path)['data']\n",
    "\n",
    "    # Build a per-dataset copy rather than writing 'dataset_name'/'model_size' into the shared base_cfg,\n",
    "    # so running this cell does not change what cells that read base_cfg see afterwards.\n",
    "    cfg = {**base_cfg, 'dataset_name': d, 'model_size': '70B'}\n",
    "    verifier_ds = VerificationDataset(**cfg)\n",
    "\n",
    "    # Stored under its own name so the per-dataset list `all_labels` used by the aggregate plot is not overwritten.\n",
    "    # Cast to bool so the labels act as a mask below (0/1 integers would be treated as positions instead).\n",
    "    labels = np.array(dataset['answer_correct']).flatten().astype(bool)\n",
    "\n",
    "    for panel, v in zip(ax.flat, verifiers_70b):\n",
    "        if v == 'mv_verifier':\n",
    "            # Not read from the HF dataset: last index along the third axis of test_data[0].\n",
    "            scores = verifier_ds.test_data[0][:, :, -1].flatten()\n",
    "        else:\n",
    "            scores = np.array(dataset[v]).flatten().astype(float)\n",
    "\n",
    "        # Drop generations without a score, keeping the labels aligned with what remains.\n",
    "        nan_idxs = np.isnan(scores)\n",
    "        scores = scores[~nan_idxs]\n",
    "        labels_no_nans = labels[~nan_idxs]\n",
    "\n",
    "        # Shared bin edges so the two histograms are directly comparable.\n",
    "        bins = np.histogram_bin_edges(scores, bins=20)\n",
    "\n",
    "        panel.hist(scores[labels_no_nans], bins=bins, alpha=0.5, label=\"Correct\")\n",
    "        panel.hist(scores[~labels_no_nans], bins=bins, alpha=0.5, label=\"Incorrect\")\n",
    "        panel.set_title(v, fontsize=20)\n",
    "        panel.set_yticklabels([])\n",
    "        panel.legend()\n",
    "\n",
    "    # Hide the panels that did not receive a verifier.\n",
    "    for unused in ax.flat[len(verifiers_70b):]:\n",
    "        unused.set_visible(False)\n",
    "\n",
    "    fig.suptitle(d, fontsize=28)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Example: inspect ArmoRM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n",
    "\n",
    "# Checkpoint identifier and the device the model is placed on.\n",
    "device = \"cuda\"\n",
    "path = \"RLHFlow/ArmoRM-Llama3-8B-v0.1\"\n",
    "\n",
    "# The model is loaded in bfloat16 with trust_remote_code=True; the tokenizer comes from the same checkpoint.\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\n",
    "    path,\n",
    "    device_map=device,\n",
    "    trust_remote_code=True,\n",
    "    torch_dtype=torch.bfloat16,\n",
    ")\n",
    "tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "# Build a nonsense response from 30 vocabulary entries drawn with replacement.\n",
    "# The vocabulary is sorted first so that the seed alone determines the sample.\n",
    "# NOTE(review): assumes the key order of tokenizer.get_vocab() is not guaranteed to be stable across runs - confirm.\n",
    "random.seed(42)\n",
    "vocab_tokens = sorted(tokenizer.get_vocab())\n",
    "random_sentence = \" \".join(random.choices(vocab_tokens, k=30))\n",
    "print(random_sentence)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "prompt = \"What are some synonyms for the word 'beautiful'?\"\n",
    "# response1 is the random-token sentence; response2 is a sensible answer scored in a later cell.\n",
    "response1 = random_sentence\n",
    "\n",
    "response2 = '''Certainly! Here are some synonyms for the word \"beautiful\":\n",
    "\n",
    "1. Gorgeous\n",
    "2. Lovely\n",
    "3. Stunning\n",
    "4. Attractive\n",
    "5. Pretty\n",
    "6. Elegant\n",
    "7. Exquisite\n",
    "8. Handsome\n",
    "9. Charming\n",
    "10. Alluring\n",
    "11. Radiant\n",
    "12. Magnificent\n",
    "13. Graceful\n",
    "14. Enchanting\n",
    "15. Dazzling\n",
    "\n",
    "These synonyms can be used in various contexts to convey the idea of beauty.'''\n",
    "\n",
    "# Score the (prompt, response1) conversation with the reward model.\n",
    "messages = [\n",
    "    {\"role\": \"user\", \"content\": prompt},\n",
    "    {\"role\": \"assistant\", \"content\": response1},\n",
    "]\n",
    "input_ids = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(device)\n",
    "with torch.no_grad():\n",
    "    output = model(input_ids)\n",
    "    # Multi-objective rewards for the response\n",
    "    multi_obj_rewards = output.rewards.cpu().float()\n",
    "    # The gating layer's output is conditioned on the prompt\n",
    "    gating_output = output.gating_output.cpu().float()\n",
    "    # The preference score for the response, aggregated from the\n",
    "    # multi-objective rewards with the gating layer\n",
    "    preference_score = output.score.cpu().float()\n",
    "\n",
    "# A transformation matrix is applied to the multi-objective rewards before they are\n",
    "# multiplied with the gating layer's output; this mainly aims at reducing the\n",
    "# verbosity bias of the original reward objectives.\n",
    "obj_transform = model.reward_transform_matrix.data.cpu().float()\n",
    "# The final coefficients assigned to each reward objective\n",
    "multi_obj_coeffs = gating_output @ obj_transform.T\n",
    "\n",
    "# The preference score is the linear combination of the multi-objective rewards with these\n",
    "# coefficients. torch.allclose returns a single bool, so the check also works for batches larger than 1\n",
    "# (asserting on the element-wise torch.isclose result raises for multi-element tensors).\n",
    "assert torch.allclose(torch.sum(multi_obj_rewards * multi_obj_coeffs, dim=1), preference_score, atol=1e-3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-objective rewards (output.rewards) for response1, the random-token reply scored in the previous cell\n",
    "multi_obj_rewards"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the (prompt, response2) conversation with the reward model.\n",
    "messages = [\n",
    "    {\"role\": \"user\", \"content\": prompt},\n",
    "    {\"role\": \"assistant\", \"content\": response2},\n",
    "]\n",
    "input_ids = tokenizer.apply_chat_template(messages, return_tensors=\"pt\").to(device)\n",
    "with torch.no_grad():\n",
    "    output = model(input_ids)\n",
    "    # Multi-objective rewards for the response\n",
    "    multi_obj_rewards = output.rewards.cpu().float()\n",
    "    # The gating layer's output is conditioned on the prompt\n",
    "    gating_output = output.gating_output.cpu().float()\n",
    "    # The preference score for the response, aggregated from the\n",
    "    # multi-objective rewards with the gating layer\n",
    "    preference_score = output.score.cpu().float()\n",
    "\n",
    "# A transformation matrix is applied to the multi-objective rewards before they are\n",
    "# multiplied with the gating layer's output; this mainly aims at reducing the\n",
    "# verbosity bias of the original reward objectives.\n",
    "obj_transform = model.reward_transform_matrix.data.cpu().float()\n",
    "# The final coefficients assigned to each reward objective\n",
    "multi_obj_coeffs = gating_output @ obj_transform.T\n",
    "\n",
    "# The preference score is the linear combination of the multi-objective rewards with these\n",
    "# coefficients. torch.allclose returns a single bool, so the check also works for batches larger than 1\n",
    "# (asserting on the element-wise torch.isclose result raises for multi-element tensors).\n",
    "assert torch.allclose(torch.sum(multi_obj_rewards * multi_obj_coeffs, dim=1), preference_score, atol=1e-3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-objective rewards (output.rewards) for response2, the synonym-list reply scored in the previous cell\n",
    "multi_obj_rewards"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mayeeenv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
