{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2ddc1af8",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_source = ['math-amc', 'math-aime', 'gpqa', 'mip-insufficient', 'mip-question', 'bbh', 'bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_logical_deduction_five_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_snarks', 'underthink-bench', ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "5217088d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 09-23 15:22:55 [importing.py:53] Triton module has been replaced with a placeholder.\n",
      "INFO 09-23 15:22:56 [__init__.py:239] Automatically detected platform cuda.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "# Both variables are set before transformers / vllm are imported below.\n",
    "os.environ.update({\n",
    "    'HF_HOME': 'NONE.cache/huggingface',\n",
    "    'CUDA_VISIBLE_DEVICES': '7',\n",
    "})\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "from vllm import LLM\n",
    "\n",
    "# Tokenizer whose `encode` is used further down to measure response lengths.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen3-4B\")\n",
    "# Model ids kept in this cell (presumably alternatives to switch between):\n",
    "# Qwen/Qwen3-4B\n",
    "# deepseek-ai/DeepSeek-R1-Distill-Llama-8B\n",
    "# deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\n",
    "# model = LLM(model=\"Qwen/Qwen3-4B\", gpu_memory_utilization=0.95, tensor_parallel_size=1, task=\"embed\")\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d5eca1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ## SUPERGPQA\n",
    "# BaseModelPath='NONEcode/verl-fork/verl/scripts/eval/verlCheckpoint/mathbaseRun/qwen4b_10k_base_model/rollout/val_0_v6.jsonl'\n",
    "# CorrectnessOnlyPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/CorrectnessOnly/qwen4b_dapo_math_10k_correctnessOnly/rollout/val_140_v6.jsonl'\n",
    "# no_summary_path='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/NonSummary/qwen4b_dapo_math_10k_context_linear_reward_no_summary_no_difficulty/rollout/val_140_v5.jsonl'\n",
    "# ourMethodPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/qwen4b_dapo_math_10k_context_linear_reward_with_summary_attention/rollout/val_140_v5.jsonl'\n",
    "\n",
    "## GPQA\n",
    "# BaseModelPath='NONEcode/verl-fork/verl/scripts/eval/verlCheckpoint/mathbaseRun/qwen4b_10k_base_model/rollout/val_0_v0.jsonl'\n",
    "# CorrectnessOnlyPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/CorrectnessOnly/qwen4b_dapo_math_10k_correctnessOnly/rollout/val_140_v0.jsonl'\n",
    "# no_summary_path='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/NonSummary/qwen4b_dapo_math_10k_context_linear_reward_no_summary_no_difficulty/rollout/val_140_v0.jsonl'\n",
    "# ourMethodPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/qwen4b_dapo_math_10k_context_linear_reward_with_summary_attention/rollout/val_140_v0.jsonl'\n",
    "\n",
    "\n",
    "## BBEH\n",
    "# BaseModelPath='NONEcode/verl-fork/verl/scripts/eval/verlCheckpoint/mathbaseRun/qwen4b_10k_base_model/rollout/val_0_v5.jsonl'\n",
    "# CorrectnessOnlyPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/CorrectnessOnly/qwen4b_dapo_math_10k_correctnessOnly/rollout/val_140_v5.jsonl'\n",
    "# no_summary_path='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/NonSummary/qwen4b_dapo_math_10k_context_linear_reward_no_summary_no_difficulty/rollout/val_140_v4.jsonl'\n",
    "# ourMethodPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/qwen4b_dapo_math_10k_context_linear_reward_with_summary_attention/rollout/val_140_v4.jsonl'\n",
    "\n",
    "# ## BBH\n",
    "# BaseModelPath='NONEcode/verl-fork/verl/scripts/eval/verlCheckpoint/mathbaseRun/qwen4b_10k_base_model/rollout/val_0_v2.jsonl'\n",
    "# CorrectnessOnlyPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/CorrectnessOnly/qwen4b_dapo_math_10k_correctnessOnly/rollout/val_140_v2.jsonl'\n",
    "# no_summary_path='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/NonSummary/qwen4b_dapo_math_10k_context_linear_reward_no_summary_no_difficulty/rollout/val_140_v1.jsonl'\n",
    "# ourMethodPath='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/qwen4b_dapo_math_10k_context_linear_reward_with_summary_attention/rollout/val_140_v2.jsonl'\n",
    "\n",
    "\n",
    "# compression_only AIME, AMC, GPQA, SUPERGPQA, BBH\n",
    "# compression_only_path='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/qwen4b_dapo_math_10k_context_linear_reward_attention_no_difficulty/rollout/val_120_v0.jsonl'\n",
    "\n",
    "## compression_only BBEH \n",
    "# compression_only_path='NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/qwen4b_dapo_math_10k_context_linear_reward_attention_no_difficulty/rollout/val_120_v1.jsonl'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "id": "739a2ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Read the baseline rollouts: one JSON record per line of the file.\n",
    "# NOTE(review): `BaseModelPath` has to exist already; the path-selection cell\n",
    "# only contains commented-out assignments, so one must be enabled first.\n",
    "with open(BaseModelPath, 'r') as rollout_file:\n",
    "    Baseline_model_data = [json.loads(raw_line) for raw_line in rollout_file]\n",
    "\n",
    "# Relabel every record whose data source contains 'bbh_' as plain 'bbh'.\n",
    "for record in Baseline_model_data:\n",
    "    if 'bbh_' in record['data_source']:\n",
    "        record['data_source'] = 'bbh'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 187,
   "id": "401797bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Read the rollouts of the method under comparison: one JSON record per line.\n",
    "# NOTE(review): `CorrectnessOnlyPath` has to exist already; the path-selection\n",
    "# cell only contains commented-out assignments, so one must be enabled first.\n",
    "with open(CorrectnessOnlyPath, 'r') as rollout_file:\n",
    "    second_method_data = [json.loads(raw_line) for raw_line in rollout_file]\n",
    "\n",
    "# Relabel every record whose data source contains 'bbh_' as plain 'bbh'.\n",
    "for record in second_method_data:\n",
    "    if 'bbh_' in record['data_source']:\n",
    "        record['data_source'] = 'bbh'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 188,
   "id": "87bdc386",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10575, 10575)"
      ]
     },
     "execution_count": 188,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of loaded records for each run: (baseline, second method).\n",
    "tuple(len(rows) for rows in (Baseline_model_data, second_method_data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "id": "83d37c04",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'data_source': 'bbh',\n",
       " 'prompt': \"user\\nThe following paragraphs each describe a set of five objects arranged in a fixed order. The statements are logically consistent within each paragraph. A fruit stand sells five fruits: apples, mangoes, oranges, plums, and pears. The oranges are more expensive than the mangoes. The plums are the cheapest. The apples are less expensive than the mangoes. The pears are more expensive than the oranges.\\nOptions:\\n(A) The apples are the most expensive\\n(B) The mangoes are the most expensive\\n(C) The oranges are the most expensive\\n(D) The plums are the most expensive\\n(E) The pears are the most expensive Let's think step by step and output the final answer within \\\\boxed{}.\\nassistant\\n\",\n",
       " 'response': \"<think>\\nOkay, let's try to figure this out. So, there are five fruits: apples, mangoes, oranges, plums, and pears. The problem gives me some inequalities about their prices. Let me list them out again to make sure I have them all:\\n\\n1. Oranges are more expensive than mangoes. So, Oranges > Mangoes.\\n2. Plums are the cheapest. So, Plums are the lowest price.\\n3. Apples are less expensive than mangoes. So, Apples < Mangoes.\\n4. Pears are more expensive than oranges. So, Pears > Oranges.\\n\\nAnd I need to determine which fruit is the most expensive. The options are between the apples, mangoes, oranges, plums, or pears.\\n\\nLet me start by ordering them based on the given information. Since plums are the cheapest, that's fixed. So, Plums are at the bottom.\\n\\nNow, from the third statement, apples are cheaper than mangoes. So Apples < Mangoes. And from the first statement, Oranges > Mangoes. So combining these, Oranges are more expensive than mangoes, which are more expensive than apples. So that would be Oranges > Mangoes > Apples. But where do plums fit in? Well, plums are the cheapest, so they are below apples, right? Wait, but apples are already cheaper than mangoes, but we don't know how apples compare to plums. Wait, plums are the cheapest, so plums are cheaper than all others. So, plums are the cheapest, so plums < apples, plums < mangoes, plums < oranges, plums < pears. Wait, but the problem says that pears are more expensive than oranges, so pears are the highest? Let me check.\\n\\nSo, let me try to write the order step by step.\\n\\nWe know:\\n\\nPlums are the cheapest. So, Plums < all others.\\n\\nApples < Mangoes.\\n\\nOranges > Mangoes.\\n\\nPears > Oranges.\\n\\nSo, putting these together:\\n\\nPears are more expensive than Oranges, which are more expensive than Mangoes, which are more expensive than Apples. 
And Plums are the cheapest.\\n\\nSo, the order from most expensive to least would be:\\n\\nPears > Oranges > Mangoes > Apples > Plums.\\n\\nWait, but is that correct? Let me check again. Let me make sure that all the inequalities are satisfied.\\n\\n- Pears > Oranges: Yes, that's given.\\n\\n- Oranges > Mangoes: Yes.\\n\\n- Mangoes > Apples: Yes.\\n\\n- Plums are the cheapest: Yes, so they are less than all others.\\n\\nSo, that would make pears the most expensive, then oranges, then mangoes, then apples, then plums. So the answer would be option E: The pears are the most expensive.\\n\\nWait, but let me check if there's any possibility that another fruit could be higher. For example, could pears be higher than oranges, but is there any other constraints?\\n\\nNo, because pears are explicitly stated to be more expensive than oranges. So the order is pears > oranges > mangoes > apples > plums. Therefore, pears are the most expensive. So option E.\\n\\nLet me check the options again:\\n\\n(A) Apples - no, they're the cheapest among the non-plums, but plums are the cheapest overall. So no.\\n\\n(B) Mangoes - no, because oranges and pears are higher.\\n\\n(C) Oranges - no, pears are higher.\\n\\n(D) Plums - no, they are the cheapest.\\n\\n(E) Pears - yes.\\n\\nTherefore, the answer should be E.\\n</think>\\n\\nTo determine which fruit is the most expensive, let’s analyze the given information step by step:\\n\\n### Given Statements:\\n1. **Oranges are more expensive than mangoes.**  \\n   → **Oranges > Mangoes**\\n\\n2. **Plums are the cheapest.**  \\n   → **Plums < All others**\\n\\n3. **Apples are less expensive than mangoes.**  \\n   → **Apples < Mangoes**\\n\\n4. 
**Pears are more expensive than oranges.**  \\n   → **Pears > Oranges**\\n\\n---\\n\\n### Step-by-Step Ordering:\\n\\nFrom the above, we can construct a logical hierarchy of price:\\n\\n- Since **plums are the cheapest**, they are at the bottom.\\n- **Apples < Mangoes** and **Oranges > Mangoes**, so the order is:  \\n  **Oranges > Mangoes > Apples**\\n- Additionally, **Pears > Oranges**, so adding that, we get:  \\n  **Pears > Oranges > Mangoes > Apples**\\n- **Plums** are the cheapest, so they are at the bottom of this chain.\\n\\nSo the complete ordering from most to least expensive is:\\n\\n**Pears > Oranges > Mangoes > Apples > Plums**\\n\\n---\\n\\n### Final Evaluation of Options:\\n\\n- **(A) The apples are the most expensive** → Incorrect. Apples are the cheapest among the non-plum group.\\n- **(B) The mangoes are the most expensive** → Incorrect. Oranges and pears are more expensive than mangoes.\\n- **(C) The oranges are the most expensive** → Incorrect. Pears are more expensive than oranges.\\n- **(D) The plums are the most expensive** → Incorrect. Plums are the cheapest.\\n- **(E) The pears are the most expensive** → **Correct.** Pears are at the top of the price order.\\n\\n---\\n\\n### Final Answer:\\n\\n$$\\n\\\\boxed{E}\\n$$\",\n",
       " 'ground_truth': 'E',\n",
       " 'score': {'score': 4.0, 'soft_format': 0.25, 'hard_format': 0.25},\n",
       " 'difficulty': None,\n",
       " 'reason': \"score: {'score': 4.0, 'soft_format': 0.25, 'hard_format': 0.25}\",\n",
       " 'final_reward': 1}"
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the first record of the second run to inspect its fields.\n",
    "second_method_data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "id": "ead5a0ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bbh 0.8467139479905438 10575\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "# Collect the stored `final_reward` of every baseline record, keyed by data source.\n",
    "accuracy_by_source = {}\n",
    "for record in Baseline_model_data:\n",
    "    accuracy_by_source.setdefault(record['data_source'], []).append(record['final_reward'])\n",
    "\n",
    "# Print mean reward and sample count for each data source.\n",
    "for source, rewards in accuracy_by_source.items():\n",
    "    print(source, np.mean(rewards), len(rewards))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "id": "c78bcf68",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from math_verify.errors import TimeoutException\n",
    "from math_verify.metric import math_metric\n",
    "from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig\n",
    "\n",
    "def clean_prediction(model_output: str) -> str:\n",
    "    \"\"\"\n",
    "    Try to normalize model outputs like 'Answer: C. Cotton' into '\\boxed{C}'.\n",
    "    \"\"\"\n",
    "\n",
    "    # If boxed answer is already there, keep it\n",
    "    boxed_match = re.search(r'\\\\boxed\\{([A-Za-z0-9]+)\\}', model_output)\n",
    "    if boxed_match:\n",
    "        return f\"\\\\boxed{{{boxed_match.group(1)}}}\"\n",
    "\n",
    "    # Otherwise, try to find a single-letter multiple-choice answer\n",
    "    choice_match = re.search(r'\\b([A-E])\\b', model_output)\n",
    "    if choice_match:\n",
    "        return f\"\\\\boxed{{{choice_match.group(1)}}}\"\n",
    "\n",
    "    # If nothing found, just return as-is\n",
    "    return model_output.strip()\n",
    "\n",
    "def getRawCorrectness(model_output: str, ground_truth: str) -> float:\n",
    "    \"\"\"\n",
    "    Score `model_output` against `ground_truth` using the whole response text.\n",
    "\n",
    "    The ground truth is wrapped in a boxed expression and compared with the\n",
    "    output of `clean_prediction` through a `math_metric` verifier.\n",
    "\n",
    "    Returns:\n",
    "        The score returned by the verifier, or 0.0 when cleaning or\n",
    "        verification raises an exception.\n",
    "    \"\"\"\n",
    "    gold_targets = (LatexExtractionConfig(),)\n",
    "    pred_targets = (ExprExtractionConfig(), LatexExtractionConfig())\n",
    "    scorer = math_metric(\n",
    "        gold_extraction_target=gold_targets,\n",
    "        pred_extraction_target=pred_targets,\n",
    "    )\n",
    "    boxed_gold = f\"\\\\boxed{{{ground_truth}}}\"\n",
    "\n",
    "    try:\n",
    "        score, _ = scorer([boxed_gold], [clean_prediction(model_output)])\n",
    "    except Exception:\n",
    "        # Best effort: any failure while cleaning or verifying counts as 0.0.\n",
    "        return 0.0\n",
    "    return score\n",
    "\n",
    "def getCorrectness(model_output: str, ground_truth: str) -> float:\n",
    "    \"\"\"\n",
    "    Score only the text after the reasoning block against `ground_truth`.\n",
    "\n",
    "    If the response contains '</think>', everything up to and including the\n",
    "    last such tag is dropped; a response without the tag is scored in full.\n",
    "    The remaining text is scored by `getRawCorrectness`, so both functions\n",
    "    share one verification path instead of two copies of the same code.\n",
    "\n",
    "    Args:\n",
    "        model_output: full model response, optionally with a think block.\n",
    "        ground_truth: reference answer (without the boxed wrapper).\n",
    "\n",
    "    Returns:\n",
    "        The score from `getRawCorrectness` (0.0 when verification fails).\n",
    "    \"\"\"\n",
    "    # rsplit with maxsplit=1 keeps only what follows the last '</think>'.\n",
    "    answer_part = model_output.rsplit('</think>', 1)[-1]\n",
    "    return getRawCorrectness(answer_part, ground_truth)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "id": "1e19a2fc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10575/10575 [00:38<00:00, 273.84it/s]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "# data source -> list of (correctness, response length in tokens) for the baseline run\n",
    "correctness_response_length_list_baseline = {}\n",
    "for record in tqdm(Baseline_model_data):\n",
    "    per_source = correctness_response_length_list_baseline.setdefault(record['data_source'], [])\n",
    "    correctness = getCorrectness(record['response'], record['ground_truth'])\n",
    "    token_count = len(tokenizer.encode(record['response']))\n",
    "    per_source.append((correctness, token_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "id": "433bf280",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10575/10575 [00:29<00:00, 352.65it/s]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "# data source -> list of (correctness, response length in tokens) for the second method\n",
    "correctness_response_length_list_second_method = {}\n",
    "for record in tqdm(second_method_data):\n",
    "    per_source = correctness_response_length_list_second_method.setdefault(record['data_source'], [])\n",
    "    correctness = getCorrectness(record['response'], record['ground_truth'])\n",
    "    token_count = len(tokenizer.encode(record['response']))\n",
    "    per_source.append((correctness, token_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "id": "ceb21c3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_methods(baseline, new):\n",
    "    \"\"\"\n",
    "    Compute relative accuracy and compression ratio between baseline and new method.\n",
    "\n",
    "    Args:\n",
    "        baseline (list[tuple]): (accuracy, length) for baseline method\n",
    "        new (list[tuple]): (accuracy, length) for new method\n",
    "\n",
    "    Returns:\n",
    "        dict: results with relative accuracy and compression ratio\n",
    "    \"\"\"\n",
    "    # Unpack\n",
    "    base_acc = [a for a, _ in baseline]\n",
    "    base_len = [l for _, l in baseline]\n",
    "    new_acc = [a for a, _ in new]\n",
    "    new_len = [l for _, l in new]\n",
    "\n",
    "    # Average accuracy and lengths\n",
    "    avg_base_acc = sum(base_acc) / len(base_acc)\n",
    "    avg_new_acc = sum(new_acc) / len(new_acc)\n",
    "    avg_base_len = sum(base_len) / len(base_len)\n",
    "    avg_new_len = sum(new_len) / len(new_len)\n",
    "\n",
    "    # Accuracy difference\n",
    "    diff_acc = avg_new_acc - avg_base_acc\n",
    "\n",
    "    # Compression ratio (new/baseline)\n",
    "    comp_ratio = avg_new_len / avg_base_len if avg_base_len != 0 else float('inf')\n",
    "\n",
    "    # Percentage reduction in length\n",
    "    comp_reduction = (1 - comp_ratio) * 100\n",
    "\n",
    "    return {\n",
    "        \"Baseline Avg Accuracy\": avg_base_acc,\n",
    "        \"New Avg Accuracy\": avg_new_acc,\n",
    "        \"Difference in Accuracy\": diff_acc * 100,\n",
    "        \"Baseline Avg Length\": avg_base_len,\n",
    "        \"New Avg Length\": avg_new_len,\n",
    "        \"Compression Ratio\": round(comp_ratio, 4),\n",
    "        \"Compression Reduction (%)\": round(comp_reduction, 2)\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 194,
   "id": "66507947",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Source:  bbh\n",
      "Baseline Model:  0.8243026004728132\n",
      "Second Method:  0.8304491725768322\n",
      "Difference in Accuracy: 0.6146572104018921\n",
      "Compression Reduction (%) 24.92\n",
      "--------------------------------\n"
     ]
    }
   ],
   "source": [
    "# Per data source: compare the second method with the baseline and print the\n",
    "# headline numbers. `evaluate_methods` also returns the mean lengths and the\n",
    "# raw compression ratio, which are not printed here.\n",
    "for source, baseline_pairs in correctness_response_length_list_baseline.items():\n",
    "    print(\"Data Source: \", source)\n",
    "    summary = evaluate_methods(baseline_pairs, correctness_response_length_list_second_method[source])\n",
    "    for label, field in (\n",
    "        (\"Baseline Model: \", \"Baseline Avg Accuracy\"),\n",
    "        (\"Second Method: \", \"New Avg Accuracy\"),\n",
    "        (\"Difference in Accuracy:\", \"Difference in Accuracy\"),\n",
    "        (\"Compression Reduction (%)\", \"Compression Reduction (%)\"),\n",
    "    ):\n",
    "        print(label, summary[field])\n",
    "    print(\"--------------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "id": "22c5215e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_aucoaa(data):\n",
    "    # data is a list of (correctness, response_length) tuples\n",
    "    n = len(data)\n",
    "    tmax = max(length for _, length in data)\n",
    "    oaa_t_list = []\n",
    "    for t in range(tmax + 1):\n",
    "        filtered = [correct for correct, length in data if length < t]\n",
    "        if filtered:\n",
    "            oaa_t = sum(filtered) / n  # Denominator is always n, per definition\n",
    "        else:\n",
    "            oaa_t = 0.0\n",
    "        oaa_t_list.append(oaa_t)\n",
    "    aucoaa = sum(oaa_t_list) / len(oaa_t_list)\n",
    "    return aucoaa\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ea835a98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Source:  math-amc\n",
      "Accuracy:  0.7951807228915663\n",
      "Response Length:  4156.530120481928\n",
      "Number of Samples:  415\n",
      "AUC-OAA:  0.5376238678997872\n",
      "--------------------------------\n",
      "Data Source:  gpqa\n",
      "Accuracy:  0.4720812182741117\n",
      "Response Length:  4193.119796954315\n",
      "Number of Samples:  985\n",
      "AUC-OAA:  0.30289823809497224\n",
      "--------------------------------\n",
      "Data Source:  math-aime\n",
      "Accuracy:  0.45454545454545453\n",
      "Response Length:  6651.779545454546\n",
      "Number of Samples:  440\n",
      "AUC-OAA:  0.24861373632365236\n",
      "--------------------------------\n"
     ]
    }
   ],
   "source": [
    "# Per data source: accuracy, mean response length, sample count and AUC-OAA.\n",
    "# NOTE(review): `correctness_response_length_list` is not assigned in any cell\n",
    "# visible here - confirm which per-source dict this report is meant to read.\n",
    "for source, pairs in correctness_response_length_list.items():\n",
    "    print(\"Data Source: \", source)\n",
    "    print(\"Accuracy: \", np.mean([correct for correct, _ in pairs]))\n",
    "    print(\"Response Length: \", np.mean([length for _, length in pairs]))\n",
    "    print(\"Number of Samples: \", len(pairs))\n",
    "    print(\"AUC-OAA: \", compute_aucoaa(pairs))\n",
    "    print(\"--------------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "7828a9c2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<think>\\nOkay, so I need to figure out which of the given options A-J have the correct adjective order based on the example sentences provided. The user mentioned that in this variant of English, the adjectives follow a specific order. From the examples given, I need to determine what that order is.\\n\\nLooking at the example sentences, like \"ancient key\" (1), \"pyramidal earring\" (2), \"repulsive normal-size hammer\" (3), etc., I need to find a pattern in the adjectives. The correct adjective order in these examples probably follows a certain sequence, such as opinion, size, age, material, purpose, and then other descriptors.\\n\\nLet me try to deduce the order based on the examples. For instance, in (3) \"repulsive normal-size hammer\", the adjectives are \"repulsive\" (opinion), \"normal-size\" (size), and then \"hammer\" (noun). Similarly, (4) \"red snorkeling bag\" has \"red\" (color), \"snorkeling\" (purpose), then \"bag\". Wait, but that seems different. Wait, maybe the order is opinion, size, age, material, purpose, then other descriptors.\\n\\nWait, looking at example (1): \"ancient key\" – ancient is an adjective of age, then key. But in example (2): \"pyramidal earring\" – pyramidal is a shape descriptor. Maybe the order is opinion, size, age, material, purpose, and then other adjectives. Wait, but how do they fit?\\n\\nAlternatively, perhaps the order is:\\n\\n1. Opinion adjectives (e.g., repulsive, awful, nice, terrible)\\n2. Size adjectives (e.g., large, small, normal-size, medium-size)\\n3. Age adjectives (e.g., ancient, old, old-fashioned)\\n4. Material adjectives (e.g., steel, leather, wool)\\n5. Purpose adjectives (e.g., snorkeling, eating, driving)\\n6. Shape adjectives (e.g., pyramidal, triangular, spherical)\\n\\nBut looking at the example (3): \"repulsive normal-size hammer\" – the order is opinion (repulsive), size (normal-size), then the noun. So maybe that\\'s correct. 
Then in example (4): \"red snorkeling bag\" – red is a color, which is a material or maybe a different category. Wait, but the example (2): \"pyramidal earring\" – pyramidal is a shape, which is a different category. So maybe the order is:\\n\\nThe adjectives in the correct order are: opinion, size, age, material, purpose, shape.\\n\\nBut I need to check which examples fit this.\\n\\nLooking at example (3): repulsive (opinion), normal-size (size), hammer (noun). So that fits opinion, size.\\n\\nExample (4): red (color, which might be a material or a different category). But maybe color is considered a material? Hmm. Or maybe the order is opinion, size, age, material, purpose, shape. Wait, example (5): square wrench – square is a shape, so maybe that\\'s after purpose. Wait, in example (5), \"square wrench\" – the adjective is shape (square), noun. So perhaps the order is opinion, size, age, material, purpose, shape. But if that\\'s the case, then in the example (5), the adjective is a shape, which would be last. But then example (2) is \"pyramidal earring\" – pyramidal is shape, so it\\'s last. But in that example, the order is shape, then noun. But according to the examples given, the adjectives are placed before the noun, but the order of the adjectives is unclear.\\n\\nWait, looking at the example (1): \"ancient key\" – the adjective is \"ancient\", which is age. So that would be age. Then in example (2): \"pyramidal earring\" – shape. So maybe the order is: opinion, age, size, material, purpose, shape.\\n\\nBut example (3): \"repulsive normal-size hammer\" – opinion, size. Then in example (4): \"red snorkeling bag\" – red is color (maybe a material?), snorkeling is purpose. Then \"bag\".\\n\\nAnother example (7): \"nigerian box\" – \"nigerian\" is nationality, which may be a material? Or maybe not. Wait, but in the given examples, some adjectives are not in the standard order. 
For instance, \"pyramidal earring\" – pyramidal is shape, so maybe shape is after purpose? Or perhaps the correct order is opinion, size, age, material, purpose, shape. Let me check if that holds.\\n\\nBut looking at (4): \"red snorkeling bag\" – red (color), snorkeling (purpose). But if the order is opinion, size, age, material, purpose, shape, then \"red\" would be a material, but red is color, which is a different category. Hmm.\\n\\nAlternatively, the correct order is:\\n\\nOpinion, size, age, material, purpose, shape.\\n\\nSo for example:\\n\\n(3) repulsive (opinion) normal-size (size) hammer (noun). Correct.\\n\\n(4) red (material?) snorkeling (purpose) bag. But red is color. Maybe that\\'s considered a material? Maybe not. Alternatively, the order could be:\\n\\nOpinion, size, age, shape, material, purpose.\\n\\nBut this is getting confusing. Let\\'s look at example (4): \"red snorkeling bag\" – the adjectives are \"red\" and \"snorkeling\". If the correct order is opinion, size, age, shape, material, purpose, but that doesn\\'t fit here.\\n\\nAlternatively, perhaps the adjectives are ordered as:\\n\\nOpinion (like repulsive, awful), then size (like normal-size), then age (like ancient), then material (like steel), then purpose (like snorkeling), then shape (like pyramidal).\\n\\nSo for example (3): repulsive (opinion), normal-size (size), hammer (noun) – which fits.\\n\\nIn (4): red (color, perhaps considered a material?), snorkeling (purpose), bag. But maybe red is a color, which is a different category. However, in the example (3), the adjectives are opinion and size. So maybe the order is opinion, size, age, material, purpose, shape.\\n\\nBut then in example (4), the adjectives are red (color, maybe material) and snorkeling (purpose). So that would be material (red) and purpose (snorkeling). But according to the example, this is correct. 
But this might not fit the order.\\n\\nAlternatively, perhaps the adjectives are ordered as: opinion, size, age, material, purpose, shape.\\n\\nSo let\\'s check the example (2): pyramidal earring. Here, \"pyramidal\" is a shape, which would be the last. So if the correct order is opinion, size, age, material, purpose, shape, then pyramidal would be the last adjective. But in this case, \"pyramidal\" is first, which might not fit.\\n\\nAlternatively, maybe the order is: opinion, shape, size, age, material, purpose.\\n\\nBut in that case, example (2) would fit. But example (3) is repulsive (opinion), normal-size (size), so that would be first two. Then the noun.\\n\\nHmm, this is complicated. Maybe the correct order is that opinion, size, age, shape, material, purpose. Or some other order.\\n\\nAlternatively, looking at example (2): \"pyramidal earring\" – the adjective is shape. So perhaps the correct order is that the adjectives are arranged in a specific order, and the examples provided show that the order is: opinion, size, age, material, purpose, shape.\\n\\nWait, example (2) is pyramidal, which is shape. If the correct order is that shape comes after purpose, then that would not fit. But maybe the order is different.\\n\\nAlternatively, perhaps the adjectives in the examples are ordered in the following way:\\n\\n- Opinion (like repulsive, awful, nice, etc.)\\n- Size (like large, small, normal-size, etc.)\\n- Age (like ancient, old, old-fashioned)\\n- Material (like steel, leather, wool)\\n- Purpose (like snorkeling, eating, driving)\\n- Shape (like pyramidal, triangular, spherical)\\n\\nBut in the example (3), the adjectives are repulsive (opinion) and normal-size (size), then the noun. So that\\'s correct.\\n\\nIn example (4): red (color) is a material? Or maybe that\\'s a different category. However, in the example (4), \"red snorkeling bag\" is given as correct. So red is a color, and snorkeling is a purpose. 
But according to the order above, the correct order would be opinion, size, age, material, purpose, shape. But in example (4), the adjectives are red (color) and snorkeling (purpose). However, since red is a color, maybe that is considered a material.\\n\\nAlternatively, the order might be opinion, size, age, shape, material, purpose. But I\\'m not sure.\\n\\nAlternatively, maybe the correct order is: opinion, size, age, material, purpose, shape.\\n\\nBut how to check? Let\\'s look at example (1): ancient key – ancient (age), key (noun). So that\\'s age first. But in the example (3), the adjectives are opinion and size. So the order is opinion, size, age, etc. That seems inconsistent.\\n\\nAlternatively, maybe the correct order is:\\n\\nOpinion (like repulsive, awful, nice, etc.)\\n\\nSize (like large, small, normal-size)\\n\\nAge (ancient, old, old-fashioned)\\n\\nMaterial (steel, leather, wool)\\n\\nPurpose (snorkeling, eating, driving)\\n\\nShape (pyramidal, triangular, spherical)\\n\\nSo in that order, the adjectives should be ordered as opinion, size, age, material, purpose, shape. Then, in example (2) \"pyramidal earring\" would have the adjective as shape, which would be last. But in that example, the adjective is first. So that would not fit.\\n\\nAlternatively, maybe the correct order is: opinion, size, age, material, purpose, shape. But in example (2), the adjective is shape, so that would be placed at the end, but in the example, it\\'s first. So this is conflicting.\\n\\nAlternatively, maybe the adjectives are ordered based on their category, and the correct order is:\\n\\n1. Opinion (e.g., repulsive, awful, nice, terrible)\\n2. Size (e.g., large, small, normal-size)\\n3. Age (e.g., ancient, old, old-fashioned)\\n4. Shape (e.g., pyramidal, triangular, spherical)\\n5. Material (e.g., steel, leather, wool)\\n6. Purpose (e.g., snorkeling, eating, driving)\\n\\nSo, for example, (2) \"pyramidal earring\" would have shape as the first adjective. 
Then in (3) \"repulsive normal-size hammer\" would be opinion, size.\\n\\nThis might fit. Let\\'s check example (2): \"pyramidal earring\" – shape, then noun. That would fit if shape is first. So the order is shape first.\\n\\nBut in example (3), the adjectives are opinion and size. So that would be opinion first, then size. So this order would not be consistent.\\n\\nAlternatively, the correct order is that the adjectives are ordered as: opinion, size, age, material, purpose, shape. But example (2) has shape first, which would not fit.\\n\\nAlternatively, maybe the correct order is: shape, material, purpose, size, age, opinion. But this is just a guess.\\n\\nThis is really confusing. Maybe I should check the given examples again for any patterns.\\n\\nLooking at example (4): \"red snorkeling bag\" – red (color, material?), snorkeling (purpose), bag.\\n\\nExample (5): \"square wrench\" – square (shape), wrench.\\n\\nExample (6): \"awful large apple\" – awful (opinion), large (size), apple.\\n\\nSo in example (6), the adjectives are opinion and size.\\n\\nIn example (7): \"nigerian box\" – \"nigerian\" is nationality, which is a material? Or maybe not. So perhaps in this case, it\\'s a material.\\n\\nBut the problem is that the correct order is not clear. Maybe the answer is to check if the adjectives in the options are in the order that is consistent with the given examples.\\n\\nBut since the examples are given, perhaps the correct adjective order is as follows, based on the examples:\\n\\nThe order is opinion, size, age, material, purpose, shape.\\n\\nSo, in the given examples:\\n\\n(1) ancient key – ancient (age), key.\\n\\nBut that would be age, which is first. So that might not fit. But maybe the order is age, then opinion?\\n\\nAlternatively, the correct order is opinion, size, age, material, purpose, shape.\\n\\nBut example (1) has \"ancient\" (age), so that would be the third position. 
But example (6) is \"awful large apple\" – opinion (awful), size (large), then apple.\\n\\nSo that fits the order opinion, size.\\n\\nIn example (2): pyramidal earring – shape, then noun. So shape is the last. So according to the order, if the order is opinion, size, age, material, purpose, shape, then shape should be last. So that would fit.\\n\\nSo for example (2), the adjective is shape, so it\\'s last. That would fit.\\n\\nSo maybe the correct order is: opinion, size, age, material, purpose, shape.\\n\\nBut in example (1), the adjective is \"ancient\", which is age. So in this case, the adjective is age. So that would be third.\\n\\nBut in the example (1), it\\'s \"ancient key\" – so the adjective is age. So that would fit in the third position.\\n\\nIn example (2): pyramidal earring – shape is last.\\n\\nIn example (3): \"repulsive normal-size hammer\" – opinion, then size.\\n\\nSo that fits the first two positions.\\n\\nIn example (4): red snorkeling bag – red (color, which might be material), snorkeling (purpose). So according to the order, material should come before purpose. But red is color, which is a material. Then snorkeling is purpose. So the order would be material (red), then purpose (snorkeling). 
So that fits.\\n\\nExample (5): square wrench – shape (square) is last.\\n\\nExample (6): awful large apple – opinion, size.\\n\\nSo the order seems to be opinion, size, age, material, purpose, shape.\\n\\nTherefore, in the options given, each option should have adjectives in that order.\\n\\nSo let\\'s analyze each option:\\n\\n(A) iranian square violet smoking old-fashioned rubber repulsive large screwdriver\\n\\nLet\\'s break down the adjectives:\\n\\n- iranian (nationality, maybe material?)\\n- square (shape)\\n- violet (color, material)\\n- smoking (purpose)\\n- old-fashioned (age)\\n- rubber (material)\\n- repulsive (opinion)\\n- large (size)\\n- screwdriver\\n\\nWait, but according to the order opinion, size, age, material, purpose, shape, this should be:\\n\\nOpinion: repulsive\\nSize: large\\nAge: old-fashioned\\nMaterial: rubber, violet (color)\\nPurpose: smoking\\nShape: square\\n\\nBut in the option, the order is: iranian, square, violet, smoking, old-fashioned, rubber, repulsive, large, screwdriver.\\n\\nThis is not in the correct order. Because the opinion should be first, then size, etc.\\n\\nSo the correct order for this would be repulsive (opinion), large (size), old-fashioned (age), rubber (material), smoking (purpose), square (shape), and then the noun. But in the option, the order is different.\\n\\nTherefore, option A is incorrect.\\n\\n(B) mysterious little maroon whittling ancient cloth prismlike brush\\n\\nAdjectives: mysterious (opinion), little (size), maroon (color/material), whittling (purpose), ancient (age), cloth (material), prismlike (shape)\\n\\nAccording to the order, the correct sequence would be opinion (mysterious), size (little), age (ancient), material (cloth), purpose (whittling), shape (prismlike), and then the noun. But in this option, the order is:\\n\\nmysterious (opinion), little (size), maroon (color, material), whittling (purpose), ancient (age), cloth (material), prismlike (shape). 
But here, the age comes after the purpose. That\\'s wrong. The correct order would be opinion, size, age, material, purpose, shape. In this case, the age (ancient) is after the purpose (whittling), which is incorrect. So option B is incorrect.\\n\\n(C) silly midsize crimson eating archaic leather square brazilian banana\\n\\nAdjectives: silly (opinion), midsize (size), crimson (color/material), eating (purpose), archaic (age), leather (material), square (shape), brazilian (nationality/material)\\n\\nCorrect order should be opinion (silly), size (midsize), age (archaic), material (leather, crimson, brazilian?), purpose (eating), shape (square). But in this option, after size (midsize), we have crimson (color), then eating (purpose), then archaic (age), which is incorrect. Because age should come before purpose. So the order is incorrect. So option C is incorrect.\\n\\n(D) tiny repulsive maroon drinking ancient glass pyramidal congolese fork\\n\\nAdjectives: tiny (size), repulsive (opinion), maroon (color/material), drinking (purpose), ancient (age), glass (material), pyramidal (shape), congolese (nationality)\\n\\nCorrect order is opinion (repulsive), size (tiny), age (ancient), material (glass), purpose (drinking), shape (pyramidal), and then nationality. But in the option, the order is size (tiny), opinion (repulsive), which is incorrect. The opinion should come before size. So this is wrong. Thus, D is incorrect.\\n\\n(E) green snorkeling archaic iron large awful toolbox\\n\\nAdjectives: green (color/material), snorkeling (purpose), archaic (age), iron (material), large (size), awful (opinion), toolbox.\\n\\nAccording to the order, opinion should be first, then size, then age, then material, then purpose. But here, the order is green (color/material), snorkeling (purpose), archaic (age), iron (material), large (size), awful (opinion). This is incorrect because opinion (awful) is last. 
So E is incorrect.\\n\\n(F) filipino spherical yellow exercise ancient wool medium-size good calculator\\n\\nAdjectives: filipino (nationality), spherical (shape), yellow (color), exercise (purpose), ancient (age), wool (material), medium-size (size), good (opinion), calculator.\\n\\nCorrect order: opinion (good), size (medium-size), age (ancient), material (wool), purpose (exercise), shape (spherical), then nationality. But in this option, the order is filipino, spherical, yellow, exercise, ancient, wool, medium-size, good. So the opinion (good) is last, which is wrong. Also, size (medium-size) is after material (wool), which is incorrect. So F is incorrect.\\n\\n(G) triangular violet drinking brand-new fiberglass small repulsive watch\\n\\nAdjectives: triangular (shape), violet (color/material), drinking (purpose), brand-new (age), fiberglass (material), small (size), repulsive (opinion), watch.\\n\\nThe order should be opinion (repulsive), size (small), age (brand-new), material (fiberglass), purpose (drinking), shape (triangular). But in the option, it\\'s triangular, violet, drinking, brand-new, fiberglass, small, repulsive. This is incorrect because the opinion is last. So G is incorrect.\\n\\n(H) russian prismlike aqua snorkeling old cloth house\\n\\nAdjectives: russian (nationality), prismlike (shape), aqua (color), snorkeling (purpose), old (age), cloth (material), house.\\n\\nThe correct order should be opinion (none here), then size (none), age (old), material (cloth), purpose (snorkeling), shape (prismlike). But the order is different. So this is incorrect. Also, \\'russian\\' is nationality, which might not fit the order.\\n\\n(I) massive lovely gray walking new cloth pen\\n\\nAdjectives: massive (size), lovely (opinion), gray (color), walking (purpose), new (age), cloth (material), pen.\\n\\nThe correct order would be opinion (lovely), size (massive), age (new), material (cloth), purpose (walking). 
But in this option, the order is size (massive), opinion (lovely), which is incorrect. Because opinion should come before size. So I is incorrect.\\n\\n(J) wonderful tiny gray walking archaic wood brush\\n\\nAdjectives: wonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush.\\n\\nThe order should be opinion (wonderful), size (tiny), age (archaic), material (wood), purpose (walking). The order here is correct. So J is correct.\\n\\nWait, let me check:\\n\\nThe adjectives are wonderful (opinion), tiny (size), gray (color, material), walking (purpose), archaic (age), wood (material), brush.\\n\\nBut the correct order should be opinion, size, age, material, purpose, shape. Wait, but in this case, the adjectives are:\\n\\nopinion: wonderful\\n\\nsize: tiny\\n\\nage: archaic\\n\\nmaterial: wood, gray?\\n\\nWait, gray is a color, which is part of material or color. In the correct order, after age comes material. So here, the order is:\\n\\nwonderful (opinion), tiny (size), archaic (age), gray (color, material), walking (purpose), wood (material), brush.\\n\\nBut in the option, the order is wonderful, tiny, gray, walking, archaic, wood. So the age (archaic) is after walking (purpose), which is incorrect. So J is incorrect.\\n\\nWait, no. Let\\'s check again. The option is:\\n\\nwonderful tiny gray walking archaic wood brush.\\n\\nThe adjectives are: wonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush.\\n\\nAccording to the correct order: opinion, size, age, material, purpose, shape.\\n\\nBut in this case, the order is opinion, size, then gray (color, which is material), walking (purpose), archaic (age), wood (material). Wait, this is incorrect because age should come before purpose. So the correct order would be opinion, size, age, material, purpose, shape. 
But in this option, the age (archaic) comes after the purpose (walking), which is wrong.\\n\\nSo J is also incorrect.\\n\\nWait, but the correct order is opinion, size, age, material, purpose, shape. So if the adjectives are arranged in that order, then J is incorrect. But if the order is different, perhaps I\\'m missing something.\\n\\nAlternatively, maybe the order is opinion, size, material, age, purpose, shape. But that\\'s not matching the examples.\\n\\nAlternatively, maybe the correct order is: opinion, size, age, purpose, material, shape.\\n\\nBut example (3) has opinion, size, then the noun. So that\\'s correct. But this is confusing.\\n\\nGiven the confusion, perhaps the correct answer is only option J, but I\\'m not sure. Alternatively, maybe none of them are correct. But let me check again.\\n\\nOption J is:\\n\\nwonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush.\\n\\nThe order would be opinion, size, purpose, age, material. That\\'s incorrect. So J is incorrect.\\n\\nAlternatively, maybe the correct order is opinion, size, material, purpose, age, shape. But that\\'s not matching any examples.\\n\\nGiven the confusion, maybe none of the options are correct. So the answer is K.\\n\\nBut let me check each option again more carefully.\\n\\nFor example, let\\'s recheck option J:\\n\\nwonderful (opinion), tiny (size), gray (color, material), walking (purpose), archaic (age), wood (material), brush.\\n\\nThe correct order should be opinion, size, age, material, purpose, shape. So here, the age (archaic) is after the purpose (walking), which is incorrect. So option J is wrong.\\n\\nLooking at all options, none seem to fit the correct order. Therefore, the answer is K.\\n\\nBut wait, let me check option (J) again. Maybe the age is supposed to come before purpose. 
So if the adjectives are in the order opinion, size, age, material, purpose, shape, then the correct order is:\\n\\nopinion, size, age, material, purpose, shape.\\n\\nIn option J, the order is: opinion (wonderful), size (tiny), then gray (color/material), then walking (purpose), then archaic (age), then wood (material). That would be opinion, size, purpose, age, material. Which is wrong.\\n\\nSo none of the options are correct. Therefore, the answer is K.\\n</think>\\n\\nTo determine which of the given options use the correct adjective order in this variant of English, we need to refer to the pattern in the examples provided. Based on the examples, the correct adjective order appears to be:\\n\\n1. **Opinion adjectives** (e.g., *wonderful, awful, nice, repulsive*)\\n2. **Size adjectives** (e.g., *large, small, normal-size, medium-size*)\\n3. **Age adjectives** (e.g., *ancient, old, old-fashioned, new*)\\n4. **Material adjectives** (e.g., *steel, leather, wood, cloth*)\\n5. **Purpose adjectives** (e.g., *snorkeling, eating, driving, walking*)\\n6. **Shape adjectives** (e.g., *pyramidal, triangular, spherical*)\\n\\nThis order is consistent across the examples provided. It starts with **opinion**, followed by **size**, then **age**, then **material**, then **purpose**, and finally **shape**.\\n\\n---\\n\\nLet\\'s analyze the options based on this order:\\n\\n### (A) iranian square violet smoking old-fashioned rubber repulsive large screwdriver  \\n- Adjectives: *iranian (nationality, likely material), square (shape), violet (color, likely material), smoking (purpose), old-fashioned (age), rubber (material), repulsive (opinion), large (size), screwdriver*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. 
Incorrect.\\n\\n---\\n\\n### (B) mysterious little maroon whittling ancient cloth prismlike brush  \\n- Adjectives: *mysterious (opinion), little (size), maroon (color, material), whittling (purpose), ancient (age), cloth (material), prismlike (shape), brush*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **age** comes after **purpose**. Incorrect.\\n\\n---\\n\\n### (C) silly midsize crimson eating archaic leather square brazilian banana  \\n- Adjectives: *silly (opinion), midsize (size), crimson (color, material), eating (purpose), archaic (age), leather (material), square (shape), brazilian (nationality)*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **age** comes after **purpose**. Incorrect.\\n\\n---\\n\\n### (D) tiny repulsive maroon drinking ancient glass pyramidal congolese fork  \\n- Adjectives: *tiny (size), repulsive (opinion), maroon (color, material), drinking (purpose), ancient (age), glass (material), pyramidal (shape), congolese (nationality)*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes after **size**. Incorrect.\\n\\n---\\n\\n### (E) green snorkeling archaic iron large awful toolbox  \\n- Adjectives: *green (color, material), snorkeling (purpose), archaic (age), iron (material), large (size), awful (opinion), toolbox*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. Incorrect.\\n\\n---\\n\\n### (F) filipino spherical yellow exercise ancient wool medium-size good calculator  \\n- Adjectives: *filipino (nationality), spherical (shape), yellow (color, material), exercise (purpose), ancient (age), wool (material), medium-size (size), good (opinion), calculator*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. 
Incorrect.\\n\\n---\\n\\n### (G) triangular violet drinking brand-new fiberglass small repulsive watch  \\n- Adjectives: *triangular (shape), violet (color, material), drinking (purpose), brand-new (age), fiberglass (material), small (size), repulsive (opinion), watch*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. Incorrect.\\n\\n---\\n\\n### (H) russian prismlike aqua snorkeling old cloth house  \\n- Adjectives: *russian (nationality), prismlike (shape), aqua (color, material), snorkeling (purpose), old (age), cloth (material), house*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** is missing and the rest are in incorrect order. Incorrect.\\n\\n---\\n\\n### (I) massive lovely gray walking new cloth pen  \\n- Adjectives: *massive (size), lovely (opinion), gray (color, material), walking (purpose), new (age), cloth (material), pen*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes after **size**. Incorrect.\\n\\n---\\n\\n### (J) wonderful tiny gray walking archaic wood brush  \\n- Adjectives: *wonderful (opinion), tiny (size), gray (color, material), walking (purpose), archaic (age), wood (material), brush*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **age** comes after **purpose**. Incorrect.\\n\\n---\\n\\n### Conclusion:\\n\\nNone of the options (A–J) follow the correct adjective order as deduced from the examples. Therefore, the correct answer is:\\n\\n$$\\n\\\\boxed{K}\\n$$'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "line['response']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fa28c910",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4407"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tokenizer.encode(line['response']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d81e6485",
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'your_file.jsonl'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mFileNotFoundError\u001b[39m                         Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[37]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mjson\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;66;03m## read jsonl\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43myour_file.jsonl\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mr\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m      4\u001b[39m     data = [json.loads(line) \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m f]\n",
      "\u001b[36mFile \u001b[39m\u001b[32mNONEcode/verl-fork/verl/venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py:343\u001b[39m, in \u001b[36m_modified_open\u001b[39m\u001b[34m(file, *args, **kwargs)\u001b[39m\n\u001b[32m    336\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[32m0\u001b[39m, \u001b[32m1\u001b[39m, \u001b[32m2\u001b[39m}:\n\u001b[32m    337\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m    338\u001b[39m         \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mIPython won\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m by default \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    339\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    340\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33myou can use builtins\u001b[39m\u001b[33m'\u001b[39m\u001b[33m open.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    341\u001b[39m     )\n\u001b[32m--> \u001b[39m\u001b[32m343\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'your_file.jsonl'"
     ]
    }
   ],
   "source": [
    "import json\n",
    "## read jsonl\n",
    "with open('your_file.jsonl', 'r') as f:\n",
    "    data = [json.loads(line) for line in f]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d0055897",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "55f7dbede25c49ef92fd461aeed8a3f9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/2499 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "train_dataset = load_dataset('NONEcode/verl-fork/baselines/LC-R1/dataset', split='train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bd778c0c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['problem', 'solution'],\n",
       "    num_rows: 2499\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b70c8c1f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'problem': 'Let $x$ , $y$ and $z$ all exceed $1$ and let $w$ be a positive number such that $\\\\log_xw=24$ , $\\\\log_y w = 40$ and $\\\\log_{xyz}w=12$ . Find $\\\\log_zw$ .',\n",
       " 'solution': '60'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "14e28692",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data_source</th>\n",
       "      <th>prompt</th>\n",
       "      <th>ability</th>\n",
       "      <th>reward_model</th>\n",
       "      <th>extra_info</th>\n",
       "      <th>uuid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'In triangle $ABC$, $\\sin \\angle ...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '34', 'style': 'rule-lighteva...</td>\n",
       "      <td>{'index': 0, 'split': 'train'}</td>\n",
       "      <td>d2ccd328-e712-4d6b-b909-a7956589db4e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'Let $ABCD$ be a unit square in t...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '113', 'style': 'rule-lightev...</td>\n",
       "      <td>{'index': 1, 'split': 'train'}</td>\n",
       "      <td>5fb8b773-1e5f-48ab-b5ce-8d2ba6ed53cc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'Let $a, b, c$ be distinct number...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '-3', 'style': 'rule-lighteva...</td>\n",
       "      <td>{'index': 2, 'split': 'train'}</td>\n",
       "      <td>974a3162-fb31-4720-86ab-6a3738ae84e7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'There are $7$ boxes arranged in ...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '3', 'style': 'rule-lighteval...</td>\n",
       "      <td>{'index': 3, 'split': 'train'}</td>\n",
       "      <td>c47003a1-29c7-46ad-b566-977ba9018acb</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'Let $S$ be the set of triples $(...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '37', 'style': 'rule-lighteva...</td>\n",
       "      <td>{'index': 4, 'split': 'train'}</td>\n",
       "      <td>f8e041a5-2848-4dfb-99c4-74aa70c8d211</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     data_source                                             prompt ability  \\\n",
       "0  DAPO-Math-17k  [{'content': 'In triangle $ABC$, $\\sin \\angle ...    math   \n",
       "1  DAPO-Math-17k  [{'content': 'Let $ABCD$ be a unit square in t...    math   \n",
       "2  DAPO-Math-17k  [{'content': 'Let $a, b, c$ be distinct number...    math   \n",
       "3  DAPO-Math-17k  [{'content': 'There are $7$ boxes arranged in ...    math   \n",
       "4  DAPO-Math-17k  [{'content': 'Let $S$ be the set of triples $(...    math   \n",
       "\n",
       "                                        reward_model  \\\n",
       "0  {'ground_truth': '34', 'style': 'rule-lighteva...   \n",
       "1  {'ground_truth': '113', 'style': 'rule-lightev...   \n",
       "2  {'ground_truth': '-3', 'style': 'rule-lighteva...   \n",
       "3  {'ground_truth': '3', 'style': 'rule-lighteval...   \n",
       "4  {'ground_truth': '37', 'style': 'rule-lighteva...   \n",
       "\n",
       "                       extra_info                                  uuid  \n",
       "0  {'index': 0, 'split': 'train'}  d2ccd328-e712-4d6b-b909-a7956589db4e  \n",
       "1  {'index': 1, 'split': 'train'}  5fb8b773-1e5f-48ab-b5ce-8d2ba6ed53cc  \n",
       "2  {'index': 2, 'split': 'train'}  974a3162-fb31-4720-86ab-6a3738ae84e7  \n",
       "3  {'index': 3, 'split': 'train'}  c47003a1-29c7-46ad-b566-977ba9018acb  \n",
       "4  {'index': 4, 'split': 'train'}  f8e041a5-2848-4dfb-99c4-74aa70c8d211  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## load parquet dataset\n",
    "import pandas as pd\n",
    "dataset = pd.read_parquet('NONEcode/verl-fork/verl/scripts/data/dapo-17k/train.parquet')\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "51a71306",
   "metadata": {},
   "outputs": [],
   "source": [
    "# convert this dataset into a dataset\n",
    "from datasets import Dataset\n",
    "dataset = Dataset.from_pandas(dataset)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "322a257a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info', 'uuid'],\n",
       "    num_rows: 20000\n",
       "})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "61da7071",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ab05b6d81e894388baf96b5dbcfc88f3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset = load_dataset('parquet', data_files='NONEcode/verl-fork/verl/scripts/data/dapo-17k/train.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "74c16f7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "### remove some columsn from the dataset\n",
    "dataset = dataset.remove_columns(['data_source', 'ability', 'extra_info', 'uuid'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ddacf91f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'prompt': [{'content': \"In triangle $ABC$, $\\\\sin \\\\angle A = \\\\frac{4}{5}$ and $\\\\angle A < 90^\\\\circ$. Let $D$ be a point outside triangle $ABC$ such that $\\\\angle BAD = \\\\angle DAC$ and $\\\\angle BDC = 90^\\\\circ$. Suppose that $AD = 1$ and that $\\\\frac{BD}{CD} = \\\\frac{3}{2}$. If $AB + AC$ can be expressed in the form $\\\\frac{a\\\\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$. Let's think step by step and output the final answer within \\\\boxed{}.\",\n",
       "   'role': 'user'}],\n",
       " 'reward_model': {'ground_truth': '34', 'style': 'rule-lighteval/MATH_v2'}}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset['train'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b06bc9f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset['train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5646a45f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a new column with problem name from prompt\n",
    "dataset = dataset.add_column('problem', [d['prompt'][0]['content'].replace(\"Let's think step by step and output the final answer within \\\\boxed{}.\", '').strip() for d in dataset])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ab61e1cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset.add_column('solution', [d['reward_model']['ground_truth'] for d in dataset])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "d949b790",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = dataset.remove_columns(['prompt', 'reward_model'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "26564142",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['problem', 'solution'],\n",
       "    num_rows: 20000\n",
       "})"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "868bb2f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "### save the dataset locally in dataset format 'NONEcode/verl-fork/baselines/LC-R1/dapo-17k'\n",
    "from datasets import load_dataset, load_from_disk\n",
    "# dataset.save_to_disk('NONEcode/verl-fork/baselines/LC-R1/dapo-17k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "93053ad7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the dataset from the local dataset format\n",
    "dataset = load_from_disk('NONEcode/verl-fork/baselines/LC-R1/dapo-17k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8960bb8b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
