{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44a7767f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import List\n",
    "\n",
    "def pass_at_1_from_scores(scores: List[List[int]]) -> float:\n",
    "    \"\"\"\n",
    "    Estimate pass@1 from several sampled generations per problem.\n",
    "\n",
    "    All samples of all problems are pooled: the result is the sum of the\n",
    "    (int-converted) scores divided by the total number of samples.\n",
    "\n",
    "    Args:\n",
    "        scores: one inner list per problem; every entry is converted with\n",
    "            ``int()`` before being summed, so booleans are accepted.\n",
    "\n",
    "    Returns:\n",
    "        The pooled pass rate, or ``0.0`` when there are no samples at all.\n",
    "    \"\"\"\n",
    "    n_passed = n_samples = 0\n",
    "    for problem in scores:\n",
    "        n_passed += sum(map(int, problem))\n",
    "        n_samples += len(problem)\n",
    "    if n_samples == 0:\n",
    "        # No samples: avoid dividing by zero\n",
    "        return 0.0\n",
    "    return n_passed / n_samples\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48d51eec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parquet test sets to evaluate.\n",
    "TEST_DIRS = [\n",
    "    \"datasets/test_codeforces.parquet\",\n",
    "    \"datasets/test_lcbv5.parquet\",\n",
    "]\n",
    "\n",
    "# Model identifier; its final path component is used to name the generations file.\n",
    "MODEL = \"Qwen/Qwen3-1.7B\"\n",
    "# Directory containing the saved generation JSON files that get scored.\n",
    "OUTPUT_DIR = \"outputs/inference_results/16K/\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84c96fcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import json\n",
    "import datasets\n",
    "from pathlib import Path\n",
    "from tqdm.auto import tqdm \n",
    "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
    "\n",
    "from verl.utils.reward_score import default_compute_score\n",
    "\n",
    "def reward_model_processor(reward_models):\n",
    "    \"\"\"\n",
    "    Normalise a batch of ``reward_model`` column values.\n",
    "\n",
    "    Entries stored as JSON strings are decoded, and their ``ground_truth``\n",
    "    field is re-serialised to a JSON string. Entries that are not strings\n",
    "    are passed through unchanged.\n",
    "\n",
    "    Args:\n",
    "        reward_models: iterable of reward_model values, each either a JSON\n",
    "            string or an already-decoded object.\n",
    "\n",
    "    Returns:\n",
    "        ``{\"reward_model\": [...]}`` with one processed entry per input, in\n",
    "        the same order.\n",
    "\n",
    "    Raises:\n",
    "        json.JSONDecodeError: if a string entry is not valid JSON.\n",
    "        KeyError: if a decoded dict has no ``ground_truth`` key.\n",
    "    \"\"\"\n",
    "    processed_reward_models = []\n",
    "    for reward_model in reward_models:\n",
    "        # isinstance (rather than type(...) == str) so str subclasses are decoded too\n",
    "        if isinstance(reward_model, str):\n",
    "            reward_model = json.loads(reward_model)\n",
    "            reward_model[\"ground_truth\"] = json.dumps(reward_model[\"ground_truth\"])\n",
    "        processed_reward_models.append(reward_model)\n",
    "    return {\"reward_model\": processed_reward_models}\n",
    "\n",
    "def load_dataset(parquet_path):\n",
    "    \"\"\"\n",
    "    Read a parquet file and return it as a pandas DataFrame.\n",
    "\n",
    "    The file is loaded through ``datasets.load_dataset`` (\"train\" split) and\n",
    "    its ``reward_model`` column is passed through ``reward_model_processor``\n",
    "    in batches before conversion to pandas.\n",
    "\n",
    "    Args:\n",
    "        parquet_path: value forwarded as ``data_files`` to the parquet loader.\n",
    "\n",
    "    Returns:\n",
    "        The processed dataset converted with ``to_pandas()``.\n",
    "    \"\"\"\n",
    "    split = datasets.load_dataset(\"parquet\", data_files=parquet_path)[\"train\"]\n",
    "    processed = split.map(\n",
    "        reward_model_processor,\n",
    "        input_columns=\"reward_model\",\n",
    "        batched=True,\n",
    "        batch_size=16,\n",
    "        num_proc=8,\n",
    "        desc=\"Processing reward_model\",\n",
    "    )\n",
    "    return processed.to_pandas()\n",
    "\n",
    "def _score_prompt_bool(args):\n",
    "    \"\"\"\n",
    "    Worker: grade every generation of a single prompt.\n",
    "\n",
    "    Args:\n",
    "        args: tuple ``(p, gts, outs, data_src)`` -- the prompt, its ground\n",
    "            truth, the generations to grade and the data source passed to\n",
    "            ``default_compute_score``.\n",
    "\n",
    "    Returns:\n",
    "        ``(p, flags)`` where ``flags[i]`` is the result of comparing the\n",
    "        score of ``outs[i]`` with 1, or ``False`` if scoring raised.\n",
    "    \"\"\"\n",
    "    prompt, ground_truth, outputs, source = args\n",
    "    verdicts = []\n",
    "    for candidate in outputs:\n",
    "        try:\n",
    "            passed = default_compute_score(source, candidate, ground_truth) == 1\n",
    "        except Exception:\n",
    "            # A scoring failure on one output counts as a fail; keep going\n",
    "            passed = False\n",
    "        verdicts.append(passed)\n",
    "    return prompt, verdicts\n",
    "\n",
    "def compute_scores_all_mp(original_prompts, ground_truths, generations, data_src, max_workers=None):\n",
    "    \"\"\"\n",
    "    Score all generations of all prompts in parallel worker processes.\n",
    "\n",
    "    Parallel equivalent of calling ``default_compute_score(data_src, o, gts) == 1``\n",
    "    for every output ``o`` in ``generations[str(p)][\"generations\"]`` of every\n",
    "    ``(p, gts)`` pair.\n",
    "\n",
    "    Results are tracked by row position rather than by prompt text, so rows\n",
    "    with identical prompts cannot overwrite each other and a failure can be\n",
    "    attributed to a specific row.\n",
    "\n",
    "    Args:\n",
    "        original_prompts: prompts; ``str(p)`` is the lookup key into ``generations``.\n",
    "        ground_truths: ground truths aligned with ``original_prompts``.\n",
    "        generations: mapping ``str(prompt) -> {\"generations\": [...]}``.\n",
    "        data_src: data source forwarded to the scorer.\n",
    "        max_workers: number of worker processes; defaults to CPU count minus\n",
    "            one (at least 1).\n",
    "\n",
    "    Returns:\n",
    "        scores_all: list[list[bool]] aligned with original_prompts. A prompt\n",
    "        whose worker failed entirely gets all-False for its generations.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: if prompts and ground truths differ in length.\n",
    "        KeyError: if a prompt has no entry in ``generations``.\n",
    "    \"\"\"\n",
    "    assert len(original_prompts) == len(ground_truths), \"prompts and ground_truths length mismatch\"\n",
    "\n",
    "    # Build per-prompt job args\n",
    "    jobs = [\n",
    "        (p, gts, generations[str(p)][\"generations\"], data_src)\n",
    "        for p, gts in zip(original_prompts, ground_truths)\n",
    "    ]\n",
    "\n",
    "    if max_workers is None:\n",
    "        # leave a little headroom\n",
    "        max_workers = max((os.cpu_count() or 2) - 1, 1)\n",
    "\n",
    "    # Pre-fill with all-False rows so a prompt whose worker fails still has\n",
    "    # a row of the right length in the right position.\n",
    "    scores_all = [[False] * len(job[2]) for job in jobs]\n",
    "\n",
    "    with ProcessPoolExecutor(max_workers=max_workers) as ex:\n",
    "        # Map each future to its row index; as_completed yields out of order\n",
    "        future_to_idx = {ex.submit(_score_prompt_bool, job): idx for idx, job in enumerate(jobs)}\n",
    "        with tqdm(total=len(jobs), desc=\"Scoring\", dynamic_ncols=True) as pbar:\n",
    "            for fut in as_completed(future_to_idx):\n",
    "                idx = future_to_idx[fut]\n",
    "                try:\n",
    "                    _, scores_p = fut.result()\n",
    "                    scores_all[idx] = scores_p\n",
    "                except Exception as e:\n",
    "                    # Keep going even if one prompt fails completely\n",
    "                    print(f\"[WARN] scoring failed for prompt #{idx}: {e}\")\n",
    "                finally:\n",
    "                    pbar.update(1)\n",
    "\n",
    "    return scores_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d47c5997",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score every test set and collect pass@1 (in percent) keyed by data source.\n",
    "results = {}\n",
    "for parquet_path in TEST_DIRS:\n",
    "    df = load_dataset(parquet_path)\n",
    "    original_prompts = df[\"prompt\"].tolist()\n",
    "    ground_truths = df[\"reward_model\"].apply(lambda x: x[\"ground_truth\"]).values\n",
    "    # The first row's data_source is used for the whole file\n",
    "    data_src = df[\"data_source\"].iloc[0]\n",
    "    print(data_src)\n",
    "\n",
    "    # Generations file for this data source / model pair\n",
    "    output_filename = f\"{Path(data_src).name}_{Path(MODEL).name}_generations.json\"\n",
    "    output_path = os.path.join(OUTPUT_DIR, output_filename)\n",
    "    # Explicit UTF-8 so reading does not depend on the platform's locale encoding\n",
    "    with open(output_path, \"r\", encoding=\"utf-8\") as f:\n",
    "        generations = json.load(f)\n",
    "\n",
    "    scores_all = compute_scores_all_mp(original_prompts, ground_truths, generations, data_src)\n",
    "    results[data_src] = 100*pass_at_1_from_scores(scores_all)\n",
    "    print(f\"{data_src} pass@1\", results[data_src])\n",
    "\n",
    "print(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28d3ca82",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "verl",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
