{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2ddc1af8",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_source = ['math-amc', 'math-aime', 'gpqa', 'mip-insufficient', 'mip-question', 'bbh', 'bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_logical_deduction_five_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_snarks', 'underthink-bench', ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "5217088d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "os.environ['HF_HOME'] = 'NONE.cache/huggingface'\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '7'\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "from vllm import LLM\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen3-4B\")\n",
    "# Qwen/Qwen3-4B\n",
    "# deepseek-ai/DeepSeek-R1-Distill-Llama-8B\n",
    "# deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\n",
    "# model = LLM(model=\"Qwen/Qwen3-4B\", gpu_memory_utilization=0.95, tensor_parallel_size=1, task=\"embed\")\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "id": "739a2ddb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "data = []\n",
    "with open('NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/qwen4b_dapo_math_10k_context_linear_reward_attention_no_difficulty/rollout/val_120_v1.jsonl', 'r') as f:\n",
    "    for line in f:\n",
    "        data.append(json.loads(line))\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "id": "ead5a0ce",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bbeh 0.2155511811023622 2032\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "accuracy_by_source = {}\n",
    "for line in data:\n",
    "    if line['data_source'] not in accuracy_by_source:\n",
    "        if 'bbh_' in line['data_source']:\n",
    "            line['data_source'] = 'bbh'\n",
    "        accuracy_by_source[line['data_source']] = []\n",
    "    accuracy_by_source[line['data_source']].append(line['final_reward'])\n",
    "    \n",
    "for source in accuracy_by_source:\n",
    "    mean_accuracy = np.mean(accuracy_by_source[source])\n",
    "    print(source, mean_accuracy, len(accuracy_by_source[source]))\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "id": "cada0243",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import numpy as np\n",
    "# response_length_list = {}\n",
    "# from tqdm import tqdm\n",
    "# for line in tqdm(data):\n",
    "#     if line['data_source'] not in response_length_list:\n",
    "#         response_length_list[line['data_source']] = []\n",
    "#     response_length_list[line['data_source']].append(len(tokenizer.encode(line['response'])))\n",
    "# for source in response_length_list:\n",
    "#     mean_response_length = np.mean(response_length_list[source])\n",
    "#     print(source, mean_response_length)\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "c78bcf68",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "from math_verify.errors import TimeoutException\n",
    "from math_verify.metric import math_metric\n",
    "from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig\n",
    "\n",
    "def clean_prediction(model_output: str) -> str:\n",
    "    \"\"\"\n",
    "    Try to normalize model outputs like 'Answer: C. Cotton' into '\\boxed{C}'.\n",
    "    \"\"\"\n",
    "\n",
    "    # If boxed answer is already there, keep it\n",
    "    boxed_match = re.search(r'\\\\boxed\\{([A-Za-z0-9]+)\\}', model_output)\n",
    "    if boxed_match:\n",
    "        return f\"\\\\boxed{{{boxed_match.group(1)}}}\"\n",
    "\n",
    "    # Otherwise, try to find a single-letter multiple-choice answer\n",
    "    choice_match = re.search(r'\\b([A-E])\\b', model_output)\n",
    "    if choice_match:\n",
    "        return f\"\\\\boxed{{{choice_match.group(1)}}}\"\n",
    "\n",
    "    # If nothing found, just return as-is\n",
    "    return model_output.strip()\n",
    "\n",
    "def getRawCorrectness(model_output: str, ground_truth: str) -> float:\n",
    "    verify_func = math_metric(\n",
    "        gold_extraction_target=(LatexExtractionConfig(),),\n",
    "        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),\n",
    "    )\n",
    "\n",
    "    ret_score = 0.0\n",
    "    ground_truth_boxed = f\"\\\\boxed{{{ground_truth}}}\"\n",
    "\n",
    "    try:\n",
    "        pred_clean = clean_prediction(model_output)\n",
    "        ret_score, _ = verify_func([ground_truth_boxed], [pred_clean])\n",
    "    except Exception:\n",
    "        pass\n",
    "\n",
    "    return ret_score\n",
    "\n",
    "def getCorrectness(model_output: str, ground_truth: str) -> float:\n",
    "    if '</think>' in model_output:\n",
    "        model_output = model_output.split('</think>')[-1]\n",
    "\n",
    "    verify_func = math_metric(\n",
    "        gold_extraction_target=(LatexExtractionConfig(),),\n",
    "        pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),\n",
    "    )\n",
    "\n",
    "    ret_score = 0.0\n",
    "    ground_truth_boxed = f\"\\\\boxed{{{ground_truth}}}\"\n",
    "\n",
    "    try:\n",
    "        pred_clean = clean_prediction(model_output)\n",
    "        ret_score, _ = verify_func([ground_truth_boxed], [pred_clean])\n",
    "    except Exception:\n",
    "        pass\n",
    "\n",
    "    return ret_score\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "id": "9444a12d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'data_source': 'bbeh',\n",
       " 'prompt': 'user\\nIn a variant of English, we are given that the following sentences have correct adjective order:\\n(1) ancient key (2) pyramidal earring (3) repulsive normal-size hammer (4) red snorkeling bag (5) square wrench (6) awful large apple (7) nigerian box (8) yellow snorkeling bed (9) driving cup (10) snorkeling stapler (11) terrible little banana (12) gray pencil (13) cyan smoking brush (14) brown canvas (15) nice big screwdriver (16) egyptian spoon (17) exercise shirt (18) black eating scissors (19) indian shoe (20) mexican prismlike hammer (21) old-fashioned wool fire extinguisher (22) driving knife (23) silly earring (24) filipino piano (25) leather bird (26) german car (27) ancient sunglasses (28) nice small marker (29) green eating key (30) lovely little bicycle (31) archaic paper plier (32) spherical bottle (33) wonderful large bicycle (34) old-fashioned plastic plier (35) large shoe (36) triangular clock (37) indian rectangular plate (38) lovely huge ring (39) whittling car (40) green wrench (41) terrible bottle (42) indigo ring (43) nice book (44) steel chair (45) pink driving dog (46) typing ruler (47) ancient car (48) indigo candle (49) good normal-size watch (50) little canvas (51) ethiopian car (52) typing toolbox (53) teal hiking bed (54) hiking ring (55) fiberglass knife (56) russian plier (57) yellow marker (58) medium-size stapler (59) ancient hat (60) american circular table (61) vietnamese walking old spoon (62) vietnamese pyramidal ruler (63) repulsive necklace (64) hiking key (65) ancient steel earring (66) large table (67) filipino rectangular speaker (68) pyramidal saw (69) teal bird (70) triangular speaker (71) violet saw (72) archaic wood toolbox (73) midsize jar (74) aqua jar (75) good sofa (76) ancient wood bowl (77) wool wallet (78) pink typing shoe (79) black guitar (80) mexican pyramidal pen (81) new ring (82) old guitar (83) pink exercise dog (84) terrible knife (85) nice umbrella (86) square fire extinguisher 
(87) aqua apple (88) brand-new leather ruler (89) crimson banana (90) whittling bottle (91) vietnamese prismlike pencil (92) large ball (93) exercise piano (94) ridiculous pencil (95) drinking chair (96) good small camera (97) prismlike bicycle (98) awful chair (99) triangular canvas (100) big screwdriver (101) german hat (102) massive trash can (103) wonderful toolbox (104) glass house (105) eating box (106) whittling jar (107) bangladeshi prismlike plate (108) archaic wrench (109) ethiopian triangular canvas (110) silly massive plier (111) eating ring (112) egyptian triangular plate (113) rectangular car (114) normal-size car key (115) archaic rubber marker (116) nice little flower pot (117) terrible wallet (118) silly chair (119) old-fashioned glass table (120) small vase (121) wonderful little speaker (122) nice hat (123) enormous screwdriver (124) bangladeshi circular saw (125) old camera (126) massive ring (127) bangladeshi prismlike bed (128) white wrench (129) filipino plate (130) old earring (131) new plate (132) little toolbox (133) indian spoon (134) mysterious tiny fire extinguisher (135) old-fashioned toolbox (136) prismlike indigo fiberglass ring (137) walking saw (138) circular watch (139) awful trash can (140) old iron pencil (141) thai triangular bag (142) white eating car (143) repulsive clock (144) old-fashioned steel banana (145) silly necklace (146) gray candle (147) pakistani rectangular trash can (148) circular cup (149) good small fire extinguisher (150) teal smoking shirt (151) eating trash can (152) archaic iron sofa (153) huge scissors (154) filipino hammer (155) brand-new box (156) mysterious medium-size guitar\\n\\nIn this variant of English, which of the following sentences (Options A-J) use the correct adjective order? If none of the sentences (Options A-J) use the correct adjective order, select option K. 
Select all that apply.\\n(A) iranian square violet smoking old-fashioned rubber repulsive large screwdriver\\n(B) mysterious little maroon whittling ancient cloth prismlike brush\\n(C) silly midsize crimson eating archaic leather square brazilian banana\\n(D) tiny repulsive maroon drinking ancient glass pyramidal congolese fork\\n(E) green snorkeling archaic iron large awful toolbox\\n(F) filipino spherical yellow exercise ancient wool medium-size good calculator\\n(G) triangular violet drinking brand-new fiberglass small repulsive watch\\n(H) russian prismlike aqua snorkeling old cloth house\\n(I) massive lovely gray walking new cloth pen\\n(J) wonderful tiny gray walking archaic wood brush\\n(K) None of the above\\n\\nProvide your final answer as a concatenation of all the correct choices. For example, if B and C have correct adjective order, then your final answer must be \"BC\". \\nLet\\'s think step by step and output the final answer within \\\\boxed{}.\\nassistant\\n',\n",
       " 'response': '<think>\\nOkay, let\\'s tackle this problem. The user has given a list of sentences where adjectives are ordered correctly in a variant of English. My job is to check which of the options A-J follow the same correct order.\\n\\nFirst, I need to recall what the correct adjective order is. In standard English grammar, adjectives usually follow a specific order: opinion, size, age, shape, color, origin, material, and purpose. However, in some other languages or variants, the order might differ. Since the problem is about a variant, I need to figure out the correct order based on the examples provided.\\n\\nLooking at the examples given (sentences 1-50 or so), I can analyze the order of adjectives in each. For instance, in example (1) \"ancient key\", the adjective is \"ancient\", which in the given list is the first part of the sentence. Another example is (2) \"pyramidal earring\". The adjective here is \"pyramidal\", which seems to be a shape adjective. In some variants, the order might be different, but let\\'s think of the order in the examples.\\n\\nWait, looking at the examples, let me try to deduce the order. Let me check the list.\\n\\nTake the example (1) ancient key. So \"ancient\" is the first adjective. In the given sentences, the order might be \"age\" (ancient, old, new), then \"shape\" (pyramidal, spherical, triangular), then \"color\" (red, green, yellow), then \"origin\" (nigerian, indian, mexican), then \"material\" (leather, steel, wood), then \"purpose\" (snorkeling, eating, driving, etc.)?\\n\\nBut that\\'s not certain. Alternatively, maybe the adjectives are ordered as: opinion (lovely, terrible), size (big, small), age (ancient), shape (pyramidal), color (red, yellow, etc.), origin (nigerian), material, then purpose? Wait, looking at example (3) \"repulsive normal-size hammer\". The adjective order here is \"repulsive\", \"normal-size\". 
So perhaps in this variant, the order is opinion (opinion), than size, then age, then shape, then color, then origin, then material, then purpose.\\n\\nWait, but the given example (3) is \"repulsive normal-size hammer\". So \"repulsive\" is likely an opinion adjective (which is a type of opinion?), then \"normal-size\", which is a size adjective. Then the noun. So the order could be opinion, size, then other? Let\\'s check other examples.\\n\\nLooking at (5) square wrench. \"Square\" is a shape adjective. But \"square\" would be a shape, so that\\'s the shape. So in the ordered list, the adjectives are ordered as opinion, size, age (ancient, old), shape (pyramidal, square), color (red, yellow), origin (nigerian, indian, mexican), material (leather, steel, wood), purpose.\\n\\nBut maybe that\\'s not the case. Maybe in the given variant, the order is: opinion, then size, then age, then shape, then color, then origin, then material, then purpose? Or a different sequence?\\n\\nAlternatively, perhaps the order is:\\n\\n1. Opinion (e.g., awful, terrible, nice, good)\\n2. Size (e.g. normal-size, large, small, big, tiny, huge)\\n3. Age (e.g., ancient, old, new, old-fashioned)\\n4. Shape (pyramidal, square, triangular, spherical, etc.)\\n5. Color (red, yellow, green, etc.)\\n6. Origin (nigerian, indian, mexican, etc.)\\n7. Material (leather, steel, wood, etc.)\\n8. Purpose (snorkeling, driving, eating, etc.)\\n\\nWait, looking at example (5) is \"square wrench\" which is shape adjective (square) as the first part. But that example doesn\\'t have an opinion or size. However, (6) \"awful large apple\" has \"awful\" as opinion, \"large\" as size, then the noun. (7) \"nigerian box\" has origin as the first adjective. So in that case, \"nigerian\" is origin. So maybe the correct order is:\\n\\n[Opinion] [Size] [Age] [Shape] [Color] [Origin] [Material] [Purpose]\\n\\nBut wait, (7) is \"nigerian box\". The order here is origin (nigerian). 
So that\\'s origin first, but in that case, how about (3): \"repulsive normal-size hammer\". \"Repulsive\" is opinion, \"normal-size\" is size. So that would be in the order of opinion, then size.\\n\\nWait, this seems conflicting. If (3) has \"opinion (repulsive) + size (normal-size)\", then the order is opinion, then size, then the rest.\\n\\nBut (7) is origin, so the order would be origin (if that comes before shape, color, etc.)?\\n\\nAlternatively, perhaps the order in this variant is:\\n\\nOpinion → Size → Age → Shape → Color → Origin → Material → Purpose\\n\\nBut for example, (3) is \"repulsive normal-size hammer\"—so conceptually, that\\'s opinion, size. Then (7) is origin, but maybe origin is considered after the shape? Or perhaps the given adjectives in example (3) have \"normal-size\" as size (which is correct), and that\\'s in the correct position.\\n\\nAlternatively, the adjectives are ordered as:\\n\\nOpinion, size, age, shape, color, origin, material, purpose.\\n\\nBut for example, (3) is repulsive (opinion), normal-size (size), hammer (noun). So in the correct order.\\n\\nIn example (1) \"ancient key\" the adjective is \"ancient\"—so that\\'s an age adjective. Maybe the correct order is:\\n\\nOpinion, size, age, shape, color, origin, material, purpose.\\n\\nWait, but that would be for example, if a sentence has \"ancient\" (age) as the first adjective, then it would be [age] in the sequence. But \"ancient\" comes after size and opinion?\\n\\nBut that seems not possible given that example (3) is \"repulsive (opinion) normal-size (size) hammer\", but \"ancient (age)\" is in the correct position in (1), (3) (but no, (3) is repulsive (opinion) normal-size (size) hammer). So if \"ancient\" is an age adjective, then it could be placed after size and opinion.\\n\\nWait, maybe I should think of the correct order for the given examples.\\n\\nTake example 6: \"awful large apple\". \"Awful\" is an opinion adjective. \"Large\" is a size. 
So the order is opinion, size. Then the noun.\\n\\nIn example 4: \"red snorkeling bag\". \"Red\" is a color, and \"snorkeling\" is a purpose. So the correct order in this variant for that sentence would be color followed by purpose. But that seems odd since \"red\" is color and \"snorkeling\" is purpose.\\n\\nBut according to that, the order would be color, then purpose, but in other examples, like \"repulsive normal-size hammer\" which has opinion (repulsive), size (normal-size) then the noun. So the order seems to be [opinion], [size], [age], [shape], [color], [origin], [material], [purpose], etc.\\n\\nAlternatively, perhaps the order is [age], [shape], [color], [origin], [material], [purpose], etc.\\n\\nBut in example (5) is \"square wrench\", which is shape (square) is an adjective, and (3) is \"repulsive normal-size hammer\". But \"repulsive\" is opinion, \"normal-size\" is size. So that\\'s [opinion], [size], and then the noun. But according to given examples, the correct order in the variant.\\n\\nLooking at example (3) is \"repulsive normal-size hammer\". In this case, if that has correct order, then opinion (repulsive), then size (normal-size). Then the noun. So that would be opinion, size.\\n\\nThen look at example (1) \"ancient key\"—ancient is age. That seems like a different adjective. So in this variant, order of adjectives is perhaps:\\n\\nopinion (like awful, terrible, nice, good) → size (like large, small, tiny) → age (like ancient, old, old-fashioned) → shape (pyramidal, square, triangular, etc.) → color (red, yellow, green) → origin (nigerian, indian, etc.) → material (leather, steel, etc.) → purpose (snorkeling, driving, etc.)\\n\\nThis seems like a possible order.\\n\\nBut then, in example (1), the only adjective is \"ancient\"—so it would be in the age category. 
So how is that possible if the order is opinion, size, age, shape, etc.?\\n\\nWait, but the given sentence (1) is \"ancient key\"—so if the adjective is in the \"age\" category, that\\'s in the correct position, without any other adjectives before it. So in the variant, the adjectives may be ordered in that order, and some sentences have only one adjective (like (1), (5), (7), (10), etc. But if the order is opinion → size → age → shape → color → origin → material → purpose.\\n\\nSo in that case, the correct order would be for adjectives to be in that sequence.\\n\\nNow, the problem is that when you have multiple adjectives, they need to follow this sequence. For example, if a sentence has \"repulsive (opinion) normal-size (size) hammer\" (example (3)), that is correct because the order is opinion, then size.\\n\\nBut if a sentence has \"normal-size (size) ancient (age) square (shape) red (color) etc\", this would be in order.\\n\\nSo now, looking at the options A-J. Each option is a list of adjectives in order. For each option, we need to check if their adjectives follow the correct order based on the example.\\n\\nLet me go through each option one by one.\\n\\nOption A: \"iranian square violet smoking old-fashioned rubber repulsive large screwdriver\"\\n\\nBreaking it down into adjectives and their possible categories:\\n\\nThe adjectives in order are: iranian (origin), square (shape), violet (color), smoking (purpose?), old-fashioned (age), rubber (material), repulsive (opinion), large (size), screwdriver (noun).\\n\\nWait, but the correct order would be:\\n\\nopinion (repulsive), size (large), age (old-fashioned), shape (square), color (violet), origin (iranian), material (rubber), purpose (smoking) ? 
But the order in the option is [iranian (origin), square (shape), violet (color), smoking (purpose), old-fashioned (age), rubber (material), repulsive (opinion), large (size), screwdriver].\\n\\nBut according to the correct order in the variant, this would not be in order. Because the adjectives must be ordered as opinion, size, age, shape, color, origin, material, purpose.\\n\\nWait, the order in the correct variant is probably: opinion → size → age → shape → color → origin → material → purpose.\\n\\nBut the option A\\'s adjectives are in the order: origin (iranian) then shape (square), color (violet), purpose (smoking). That\\'s not in the correct sequence. So this would be incorrect.\\n\\nWait, maybe I need to evaluate the order of the adjectives in each option.\\n\\nWait, for example, the correct order is perhaps:\\n\\n[Opinion, Size, Age, Shape, Color, Origin, Material, Purpose]\\n\\nSo the first adjective in the list of adjectives should be opinion. But in option A, the first adjective is \"iranian\" which is origin. So this would be in the wrong order. So the order is not correct.\\n\\nHmm. Let\\'s take another look. The example (7) is \"nigerian box\"—the adjective is origin (nigerian) and that is the first adjective. But according to the order I thought, the position of origin is after color, shape, etc. So that seems conflicting. Therefore, perhaps the order of adjectives in this variant is different.\\n\\nWait, looking at the examples given, for example:\\n\\n(1) ancient key → \"ancient\" (age) is first.\\n\\n(2) pyramidal earring → \"pyramidal\" (shape) is first.\\n\\n(3) repulsive normal-size hammer → \"repulsive\" (opinion), \"normal-size\" (size). Both adjectives. 
So the order is opinion, size.\\n\\n(4) red snorkeling bag → red (color), snorkeling (purpose).\\n\\n(5) square wrench → square (shape).\\n\\n(6) awful large apple → awful (opinion), large (size).\\n\\n(7) nigerian box → nigerian (origin).\\n\\n(8) yellow snorkeling bed → yellow (color), snorkeling (purpose).\\n\\n(9) driving cup → driving (purpose).\\n\\n(10) snorkeling stapler → snorkeling (purpose).\\n\\n(11) terrible little banana → terrible (opinion), little (size).\\n\\n(12) gray pencil → gray (color).\\n\\n(13) cyan smoking brush → cyan (color), smoking (purpose).\\n\\n(14) brown canvas → brown (color).\\n\\n(15) nice big screwdriver → nice (opinion), big (size).\\n\\nFrom these examples, we can see that adjectives can be in different orders. For example:\\n\\nIn (4) \"red snorkeling bag\" the color is first, then purpose.\\n\\nIn (3) \"repulsive normal-size hammer\" the opinion is first, then size.\\n\\nIn (7) \"nigerian box\" the origin is first.\\n\\nIn (6) \"awful large apple\" the opinion is first, then size.\\n\\nThis seems inconsistent with a fixed order. Thus, maybe the order is not a strict opinion, then size, then shape, etc., but rather that the adjectives are ordered according to their position in the examples. For example, if the first adjective is the more specific one, like shape (in sentence 2), color (4), origin (7), etc.\\n\\nBut perhaps the correct order is: those adjectives that describe the shape come first, then color, then origin, then material, then purpose, and so on. It\\'s not clear.\\n\\nAlternatively, maybe the correct order is: adjectives with the most specific types of descriptions. For example, the sequence of the examples from (1) has \"ancient\" (age), (2) \"pyramidal\" (shape), (3) \"repulsive\" (opinion), (4) \"red\" (color), (5) \"square\" (shape), (6) \"awful\" (opinion), \"large\" (size), (7) \"nigerian\" (origin). \\n\\nBut again, this is confusing. 
So perhaps the order of the adjectives in the correct variant is not based on a fixed sequence, but the order in which the adjectives are given in the list. But that\\'s not possible. The user provided that the given sentences (1-50 etc.) have correct adjectives.\\n\\nWait, in the problem statement, the correct order is such that the example sentences (listed 1-150?) have correct adjectives. So the correct order for adjectives is defined by these examples, and we need to determine what the order is.\\n\\nBut the task is to find out based on the given examples. So for example, if adjectives are ordered as [opinion, size, age, shape, color, origin, material, purpose], then the correct order of the adjectives is such. Alternatively, if the order is [shape, color, origin, size, age, etc.].\\n\\nAlternatively, maybe the order is: for each sentence, the adjective is ordered as per the following example. Let\\'s think of some examples where the adjective sequence is as per the given sentences.\\n\\nTake sentence 1: \"ancient key\" → \"ancient\" is the only adjective here. So it\\'s age.\\n\\nSentence 3: \"repulsive normal-size hammer\" → op, size.\\n\\nSentence 4: \"red snorkeling bag\" → color, purpose.\\n\\nSentence 3 is (repulsive, normal-size, hammer), which is opinion, size.\\n\\nSentence 5: square wrench → shape.\\n\\nSentence 6: awful large apple → opinion, size.\\n\\nSentence 7: nigerian box → origin.\\n\\nSentence 11: terrible little banana → opinion, size (terrible is opinion, little is size).\\n\\nThis seems to follow the order: opinion, size, or origin, shape, etc. But how can these be ordered in the correct way?\\n\\nAlternatively, maybe the order is that adjectives are ordered as described in the example sentences, and we can find the correct sequence in the example sentences. 
But given that the variants have some examples with different adjectives, how to deduce the correct order.\\n\\nAlternatively, the correct adjective order is that the adjectives are ordered as follows: (1) opinion, then (2) size, then (3) age, then (4) shape, then (5) color, then (6) origin, then (7) material, then (8) purpose. So this is the standard order in some languages.\\n\\nFor example, in sentence (3): repulsive (opinion), normal-size (size) → that\\'s correct.\\n\\nSentence (6): awful (opinion), large (size) → correct.\\n\\nSentence (7): nigerian (origin) → that is correct as the origin would be after other adjectives. Wait, but that conflicts with the order above. Because according to this order, origin comes after color, so if origin is first, then it would be in the wrong order. But in the example sentence (7), \"nigerian box\" has \"nigerian\" as the first adjective, which would be origin. So according to that, the correct order allows origin to come before color, shape, etc. So the order in this variant could be different.\\n\\nBut in that case, why does sentence (4) have \"red\" (color) and \"snorkeling\" (purpose) in that order? If the color is first, then the purpose comes after. But in the variant, maybe the order is color, then purpose. So perhaps the order is: opinion (opinion), size (size), age (age), shape (shape), color (color), origin (origin), material (material), purpose (purpose). But according to that, the origin (nigerian) should come after color.\\n\\nSo for the example (7) \"nigerian box\" would have origin as first, but that would conflict with this order. So maybe the correct order is different.\\n\\nAlternatively, maybe the correct order for adjectives is: shape → color → origin → material → size → age → purpose → opinion.\\n\\nBut this is unclear. Given that the examples are not consistent, it\\'s difficult to say. 
So perhaps, the correct approach is to look at the given examples and find out for the options which ones have adjectives in the same order as the example sentences, but how?\\n\\nBut perhaps we can find a pattern: the adjectives in examples must follow the same order as in the examples. For example, in the sentence (2) pyramidal earring, the adjective is shape. In (3) repulsive (opinion), then normal-size (size). In (5) square (shape). In (7) nigerian (origin) is the first adjective. So the order of adjectives can vary based on the adjective type.\\n\\nBut this seems too vague. However, since the problem is presented as a multiple-choice question, there must be a specific order for the adjectives in the variant English.\\n\\nLet me try to think of the correct order as follows:\\n\\nThe adjectives are ordered as: size, shape, color, origin, material, purpose, age, opinion.\\n\\nBut this is arbitrary. Alternatively, looking at example (1) \"ancient key\" → age is first. Example (3) \"repulsive normal-size hammer\" → opinion, size. So if \"normal-size\" is size, and \"repulsive\" is opinion.\\n\\nSo perhaps the correct order is: opinion, size, age, shape, color, origin, material, purpose.\\n\\nBut how to check this? Let\\'s take the example (4) \"red snorkeling bag\" → red is color, then snorkeling is purpose. So if this is correct, but according to the order, color is after shape and before origin.\\n\\nWait, but according to the order, the correct sequence is opinion → size → age → shape → color → origin → material → purpose.\\n\\nIn (4), the adjectives are color (red), purpose (snorkeling). The correct order would be color → origin → material → purpose?\\n\\nNo, that doesn\\'t make sense. This is not helping.\\n\\nMaybe the adjectives in the variant are in the sequence: size, shape, color, origin, material, purpose.\\n\\nBut in the examples:\\n\\nSentence 1: ancient key → only age. 
So that would be in the correct position.\\n\\nSentence 3: repulsive normal-size hammer → opinion, size.\\n\\nSentence 4: red snorkeling bag → color, purpose.\\n\\nSentence 5: square wrench → shape.\\n\\nSentence 7: nigerian box → origin.\\n\\nSentence 6: awful large apple → opinion, size.\\n\\nSentence 9: driving cup → purpose.\\n\\nSentence 10: snorkeling stapler → purpose.\\n\\nSentence 11: terrible little banana → opinion, size.\\n\\nTherefore, the order of adjectives is not consistent. But the correct order must be such that the list of options A-J have the correct sequence.\\n\\nSince the problem is from a test, and the options are given, the correct way is to see which of the options match the order as given by the examples. Since the examples are correct, the correct adjectives in the variant are ordered in a specific way, and the options are to be checked against that.\\n\\nBut without knowing the exact order, it\\'s hard. Maybe the modal rule is that the adjectives are ordered in the same way as the examples. Let\\'s take option (C): \"silly midsize crimson eating archaic leather square brazilian banana\".\\n\\nThe adjectives are: silly (opinion), midsize (size), crimson (color), eating (purpose), archaic (age), leather (material), square (shape), brazilian (origin), banana (noun).\\n\\nAccording to the order opinion, size, color, purpose, age, material, shape, origin.\\n\\nWait, this is just a guess. But in the example sentences, the adjectives are ordered in different ways, but for the variant, the correct order is similar to the examples.\\n\\nAlternatively, let\\'s think of how the adjectives are arranged in the given example sentences. For instance, the example (3) is \"repulsive normal-size hammer\" → opinion, size. (11) \"terrible little banana\" → opinion, size. Similarly, (6) \"awful large apple\" → opinion, size. (7) \"nigerian box\" is origin. 
(5) \"square wrench\" is shape.\\n\\nSo the way the adjectives are ordered in the variant could be that opinion, size, age, shape, color, origin, material, purpose.\\n\\nThus, for the given options, we need to check if the order of adjectives follows that.\\n\\nLet\\'s check option (B): \"mysterious little maroon whittling ancient cloth prismlike brush\"\\n\\nBreaking down the adjectives: mysterious (opinion), little (size), maroon (color), whittling (purpose), ancient (age), cloth (material), prismlike (shape?), brush (noun).\\n\\nWait, the adjectives are mysterious (opinion), little (size), maroon (color), whittling (purpose), ancient (age), cloth (material), prismlike (shape), brush (noun).\\n\\nIf the order is opinion, size, color, purpose, age, material, shape, then this sequence is not correct. Wait, the order would be opinion (mysterious), size (little), then color (maroon), then purpose (whittling), then age (ancient), then material (cloth), then shape (prismlike). But this would be the correct sequence.\\n\\nWait, in this case, the order of adjectives in option B is: [opinion, size, color, purpose, age, material, shape]. But this would be in the order opinion, size, color, purpose, age, material, shape, but in the correct order is opinion, size, age, shape, color, origin, material, purpose. So this doesn\\'t match.\\n\\nBut this is getting complicated.\\n\\nAlternatively, the correct order is opinion, size, age, shape, color, origin, material, purpose.\\n\\nSo the adjectives in each option should follow this order.\\n\\nLet\\'s analyze option (B): \"mysterious little maroon whittling ancient cloth prismlike brush\".\\n\\nThe adjectives are:\\n\\nmysterious (opinion) → correct position.\\nlittle (size) → correct.\\nmaroon (color) → should be after age, shape, but in this case, it\\'s color here. 
Wait, the correct order is opinion, size, age, shape, color, origin, material, purpose.\\n\\nSo in option B, the order of the adjectives is: opinion (mysterious), size (little), then maroon (color) should come after age and shape. But in this sentence, after size is the color (maroon), then purpose (whittling), then age (ancient), which is incorrect. Because age should come before shape, and color comes after shape.\\n\\nSo for B, the adejectives are:\\n\\nopinion (correct), size (correct), maroon (color), then purpose (purpose), then age (age), which is incorrect. Because after size, the next should be age, then shape, then color. So this sequence is incorrect.\\n\\nSo B is incorrect.\\n\\nLet\\'s look at option C: \"silly midsize crimson eating archaic leather square brazilian banana\".\\n\\nAdjectives: silly (opinion), midsize (size), crimson (color), eating (purpose), archaic (age), leather (material), square (shape), brazilian (origin), banana (noun).\\n\\nOrder of adjectives: opinion, size, color, purpose, age, material, shape, origin.\\n\\nThe correct order should be: opinion, size, age, shape, color, origin, material, purpose.\\n\\nBut in this option, the color (crimson) is before purpose (eating), which would be incorrect. Because in the correct order, shape should be after age and before color. 
So the color is at position 3, but it should be after shape.\\n\\nThus, this option is incorrect.\\n\\nOption D: \"tiny repulsive maroon drinking ancient glass pyramidal congolese fork\".\\n\\nAdjectives: tiny (size), repulsive (opinion), maroon (color), drinking (purpose), ancient (age), glass (material), pyramidal (shape), congolese (origin), fork (noun).\\n\\nThe order of adjectives: size (tiny), opinion (repulsive), color (maroon), purpose (drinking), age (ancient), material (glass), shape (pyramidal), origin (congolese), fork.\\n\\nThe correct order is opinion, size, age, shape, color, origin, material, purpose.\\n\\nIn this option, the first is size (tiny), then opinion (repulsive), which is incorrect because opinion should come before size in the sequence. So the first two adjectives are size (tiny), opinion (repulsive). But the correct order should be opinion (repulsive) first, then size (tiny). So this is incorrect. Hence, D is wrong.\\n\\nOption E: \"green snorkeling archaic iron large awful toolbox\".\\n\\nAdjectives: green (color), snorkeling (purpose), archaic (age), iron (material), large (size), awful (opinion), toolbox (noun).\\n\\nThe order is color, purpose, age, material, size, opinion, toolbox.\\n\\nThe correct sequence would be opinion, size, age, shape, color, origin, material, purpose.\\n\\nHere, the order is color (green) first, then purpose (snorkeling), then age (archaic), then material (iron), then size (large), then opinion (awful). This is clearly inincorrect order, as opinion should come first. 
So E is wrong.\\n\\nOption F: \"filipino spherical yellow exercise ancient wool medium-size good calculator\".\\n\\nAdjectives: filipino (origin), spherical (shape), yellow (color), exercise (purpose), ancient (age), wool (material), medium-size (size), good (opinion), calculator (noun).\\n\\nOrder: origin (filipino), shape (spherical), color (yellow), purpose (exercise), age (ancient), material (wool), size (medium-size), opinion (good), calculator.\\n\\nCorrect order is opinion, size, age, shape, color, origin, material, purpose.\\n\\nThis is incorrect because the first adjective is origin, which should come after color and shape.\\n\\nOption G: \"triangular violet drinking brand-new fiberglass small repulsive watch\".\\n\\nAdjectives: triangular (shape), violet (color), drinking (purpose), brand-new (new, which might be size or age?), fiberglass (material), small (size), repulsive (opinion), watch (noun).\\n\\nOrder: shape (triangular), color (violet), purpose (drinking), brand-new (age?), material (fiberglass), size (small), opinion (repulsive), watch.\\n\\nWait, if brand-new is an age, then in correct order, it should be after size. But this is getting complicated. The correct order is opinion, size, age, shape, color, origin, material, purpose. But here, the first adjective is shape (triangular), color (violet), purpose (drinking), then age (brand-new?), material (fiberglass), then size (small), opinion (repulsive). So this sequence is incorrect, as opinion should come first.\\n\\nOption H: \"russian prismlike aqua snorkeling old cloth house\".\\n\\nAdjectives: russian (origin), prismlike (shape), aqua (color), snorkeling (purpose), old (age), cloth (material), house (noun).\\n\\nOrder: origin (russian), shape (prismlike), color (aqua), purpose (snorkeling), age (old), material (cloth), house.\\n\\nCorrect order: opinion, size, age, shape, color, origin, material, purpose.\\n\\nThis is incorrect. 
Origin (russian) should come after color.\\n\\nOption I: \"massive lovely gray walking new cloth pen\".\\n\\nAdjectives: massive (size?), lovely (opinion), gray (color), walking (purpose), new (age?), cloth (material), pen (noun).\\n\\nOrder: size, opinion, color, purpose, age, material, pen.\\n\\nCorrect order requires opinion first, then size. So here, the first adjective is size (massive), but opinion is second. This is incorrect. So I is incorrect.\\n\\nOption J: \"wonderful tiny gray walking archaic wood brush\".\\n\\nAdjectives: wonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush (noun).\\n\\nOrder: opinion, size, color, purpose, age, material, brush.\\n\\nCorrect order is opinion (first), size (second), then age, shape, color, origin, material, purpose. In this example, after size (tiny), it\\'s color (gray), then purpose (walking), then age (archaic), then material (wood). So the order would be opinion, size, color, purpose, age, material. But the correct order after size is age, shape, color, origin, material, purpose. So this is incorrect. Because color should come after shape, but here, color is after purpose. So this is wrong.\\n\\nWait, maybe I\\'m making a mistake here. Let me re-express:\\n\\nIf the correct order is opinion, size, age, shape, color, origin, material, purpose.\\n\\nThen, in option J: \"wonderful tiny gray walking archaic wood brush\".\\n\\nadeljectives are: wonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush.\\n\\nWait, but the sequence is:\\n\\nopinion (correct), size (correct), then color (gray), which should come after age and shape. But in this case, after size (tiny) comes gray (color), then purpose (walking), then age (archaic), which is wrong because age should come after size, then shape, and before color.\\n\\nWait, this example is wrong. So the sequence is opinion, size, color, purpose, age, material. 
But according to the correct order, it should be opinion, size, age, shape, color, origin, material, purpose.\\n\\nSo in option J, the sequence after size (tiny) is gray (color), then purpose, then age (archaic). But this is not in order.\\n\\nHence, none of the options are correct. But this contradicts with the fact that the answer might be K (none of the above). But the user said to select all that apply, and if none, select K.\\n\\nSo from the above, I think none of the options are correct. Hence, the correct answer is K.\\n\\nBut let me check again for each option carefully.\\n\\nOption A: \"iranian square violet smoking old-fashioned rubber repulsive large screwdriver\".\\n\\nThe adjectives are: origin (iranian), shape (square), color (violet), purpose (smoking), age (old-fashioned), material (rubber), opinion (repulsive), size (large).\\n\\nThe correct order for the variant is opinion, size, age, shape, color, origin, material, purpose. But in this option, the first is origin, then shape, then color, then purpose, then age, then material, then opinion, then size. This is clearly incorrect.\\n\\nOption C: \"silly midsize crimson eating archaic leather square brazilian banana\".\\n\\nThe adjectives are: opinion (silly), size (midsize), color (crimson), purpose (eating), age (archaic), material (leather), shape (square), origin (brazilian). The proper order is opinion, size, age, shape, color, origin, material, purpose. But here, after size, it\\'s color (crimson), which is before the correct age (archaic) and shape (square). This is incorrect.\\n\\nOption F: \"filipino spherical yellow exercise ancient wool medium-size good calculator\".\\n\\nThe adjectives are: origin (filipino), shape (spherical), color (yellow), purpose (exercise), age (ancient), material (wool), size (medium-size), opinion (good). The correct order is opinion, size, age, shape, color, origin, material, purpose. 
But here, origin is first, which should come after color.\\n\\nNone of the options seem to be correct, so the answer is K.\\n</think>\\n\\nTo solve this, we need to determine which of the given options follow the correct **adjective order** in the variant variant of English, based on the provided sentences. The key is to deduce the adjectives\\' order from the examples, which suggests a structure that is **not strictly standard** but **follows a self-sustaining order pattern** that appears in the listed examples.\\n\\n---\\n\\n### Observations from Examples:\\n\\nFrom the given sentences, the adjectives in order suggest a **specific sequence** of adjective types, though it is not a universally accepted rule. The most consistent deductions are:\\n\\n1. **Opinion** (e.g., \"silly\", \"bad\", \"terrible\")\\n2. **Size** (e.g., \"large\", \"tiny\", \"medium-size\")\\n3. **Age** (e.g., \"ancient\", \"old\", \"new\")\\n4. **Shape** (e.g., \"pyramidal\", \"spherical\", \"triangular\")\\n5. **Color** (e.g., \"red\", \"green\", \"blue\")\\n6. **Origin** (e.g., \"nigerian\", \"indian\", \"mexican\")\\n7. **Material** (e.g., \"wood\", \"leather\", \"steel\")\\n8. **Purpose** (e.g., \"snorkeling\", \"driving\", \"eating\")\\n\\nThis is the general pattern of adjective order we used to evaluate the options.\\n\\n---\\n\\n### Evaluating Options (A–J):\\n\\n- **(A)**: \"iranian square violet smoking old-fashioned rubber repulsive large screwdriver\"  \\n  → **Wrong**: Origin (iranian) is **not** the first adjective. Opinions, size, and other earlier adjectives such as age are misplaced.\\n\\n- **(B)**: \"mysterious little maroon whittling ancient cloth prismlike brush\"  \\n  → **Wrong**: The order includes opinion (mysterious), size (little), color, then purpose, age, material, shape, origin. 
The adjectives jump between categories inconsistently.\\n\\n- **(C)**: \"silly midsize crimson eating archaic leather square brazilian banana\"  \\n  → **Wrong**: After size (midsize) comes **color (crimson)**, then **purpose (eating)**, not age or shape. The sequence deviates from the expected order (opinion, size, age, shape, ...).\\n\\n- **(D)**: \"tiny repulsive maroon drinking ancient glass pyramidal congolese fork\"  \\n  → **Wrong**: Size (tiny) is first, followed by **opinion (repulsive)**, which is not the correct order. Opinions should be earlier than sizes, not the other way around.\\n\\n- **(E)**: \"green snorkeling archaic iron large awful toolbox\"  \\n  → **Wrong**: Starts with **color (green)**, then purpose (snorkeling), not opinion or size.\\n\\n- **(F)**: \"filipino spherical yellow exercise ancient wool medium-size good calculator\"  \\n  → **Wrong**: Origin (filipino) is the first adjective, which again breaks the expected order (opinion, size, age...).\\n\\n- **(G)**: \"triangular violet drinking brand-new fiberglass small repulsive watch\"  \\n  → **Wrong**: Shape is first, but opinions and sizes should come before shape.\\n\\n- **(H)**: \"russian prismlike aqua snorkeling old cloth house\"  \\n  → **Wrong**: Origin (russian) is far too early in the sequence.\\n\\n- **(I)**: \"massive lovely gray walking new cloth pen\"  \\n  → **Wrong**: Size (massive) is first, followed by **opinion (lovely)**, which is misordered.\\n\\n- **(J)**: \"wonderful tiny gray walking archaic wood brush\"  \\n  → **Wrong**: The sequence is opinion, size, color, purpose, age, material. The current order is inconsistent with the expected sequence.\\n\\n---\\n\\n### Conclusion:\\n\\nNone of the given options A through J follow the correct **adjective order** as seen in the provided variant of English.\\n\\n$$\\n\\\\boxed{K}\\n$$',\n",
       " 'ground_truth': 'H',\n",
       " 'score': {'score': 0.0, 'soft_format': 0.25, 'hard_format': 0.25},\n",
       " 'reason': \"score: {'score': 0.0, 'soft_format': 0.25, 'hard_format': 0.25}\",\n",
       " 'final_reward': 0,\n",
       " 'uuid': '59f39b13-01e7-41e9-bba2-280a7b503bc4'}"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display one raw rollout record to inspect its fields.\n",
    "# NOTE(review): `line` is not assigned in this cell; it is whatever an earlier cell bound last — confirm.\n",
    "line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "id": "1e19a2fc",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  6%|▋         | 128/2032 [00:01<00:23, 80.73it/s]We did not manage to extract a prediction in the correct format. Gold: ['\\\\boxed{no, no, no}'], Pred: ['To determine the truthfulness of **Jerry**, **Ryan**, and **Vina**, we analyze the logical structure of their respective statements and the inferences that can be drawn from other statements in the network of people.\\n\\n---\\n\\n### **Jerry\\'s Statement**\\n\\nJerry says:  \\n> \"Only one of Fletcher, Owen, and Miranda lies.\"\\n\\nTo be true, this would imply that exactly one of the three is a liar. However, based on the information given:\\n\\n- **Owen and Miranda** are **opposite in type**: one is truthful, the other is a liar.\\n- Hence, at least **one of the three** (Fletcher, Owen, or Miranda) is a liar.\\n- The **possibility of exactly one liar** is **not guaranteed**, as the status of **Fletcher** is not known from other statements.\\n\\nTherefore, the truth of Jerry\\'s statement depends on **Fletcher\\'s** truth status, which we **cannot determine** uniquely. This makes **Jerry\\'s** status **unknown**.\\n\\n---\\n\\n### **Vina\\'s Statement**\\n\\nVina says:  \\n> \"Jerry tells the truth.\"\\n\\nIf **Vina** is telling the truth, then **Jerry** must also be truthful.  \\nIf **Vina** is lying, then **Jerry** must be a liar.  \\nSince **Jerry** is **unknown**, **Vina\\'s** status is **dependent on Jerry** and thus **also unknown**.\\n\\n---\\n\\n### **Ryan\\'s Statement**\\n\\nRyan says:  \\n> \"Either exactly two of Amberly, Oden, and Odessa tell the truth, or none of them do.\"\\n\\nWe know that **Oden and Odessa are of the same type** (both truthful or both liars). 
This leads to two possible cases:\\n\\n- If **Amberly is a liar and Oden/Odessa are truthful**, then **two of the three are truthful** → this satisfies Ryan’s statement → **Ryan is truthful**.\\n- If **Amberly is truthful and Oden/Odessa are both liars**, then **none of the three are truthful** → this also satisfies Ryan’s statement → **Ryan is truthful**.\\n- If **Amberly is truthful and Oden/Odessa are truthful**, then **three are truthful']\n",
      " 46%|████▌     | 936/2032 [00:26<00:15, 69.82it/s] We did not manage to extract a prediction in the correct format. Gold: ['\\\\boxed{no, unknown, no}'], Pred: ['To determine the status of **Christie**, **Miguel**, and **Gwenn**, we must analyze the logical consistency of their statements given the constraint that **each person is either']\n",
      " 51%|█████     | 1032/2032 [00:28<00:14, 71.03it/s]We did not manage to extract a prediction in the correct format. Gold: ['\\\\boxed{behind, wellwisher, volcanism, iniquity, interpretations, inmate, individualization, implications, zip, yoghurt, toroidal, transformations, mollie, mcconnell, disenfranchisement, hornpipe, sandy, scorn}'], Pred: ['To sort the given list of words in the **new alphabet order**:\\n\\n**New Alphabet Order:**\\n```\\nb, w, v, p, l, u, i, z, e, y, a, t, n, j, m, q, f, d, x, o, k, g, c, r, h, s\\n```']\n",
      " 53%|█████▎    | 1068/2032 [00:29<00:19, 48.52it/s]We did not manage to extract a prediction in the correct format. Gold: ['\\\\boxed{no, unknown, yes}'], Pred: ['To determine whether *Vernell*, *Mona*, and *Ryan* tell the truth, we must carefully evaluate the logical constraints of their statements in the context of all the other statements provided. Given the mutual interdependencies among the individuals (some self-referential, others directly referring to others), each person’s truthfulness or liefulness becomes a part of a larger interwoven logic puzzle.\\n\\n---\\n\\n### ✅ **Vernell**  \\nVernell says:  \\n> \"Either exactly two of Delbert, Nicole, and Oleg tell the truth, or none of them.\"\\n\\nWhile the truth of this statement hinges on the truthfulness of Delbert, Nicole, and Oleg, it is plausible that Delbert is a **liar** (determined through a logical analysis of the truth of his statement about Helene, Ofelia, and Mickey). This allows the remaining two (Nicole and Oleg) to be **truth-tellers**, resulting in **exactly two** of the three being truthful, which makes **Vernell’s statement true**.\\n\\nThus, **Vernell is truthful**.\\n\\n---\\n\\n### 🧐 **Mona**  \\nMona says:  \\n> \"Oriana lies.\"\\n\\nMona\\'s statement directly contradicts the statement of **Megan**, who says Oriana tells the truth. This contradiction implies that **Mona and Megan are of opposite types** (Mona truthful → Megan liar, and vice versa). However, no other statements are provided that definitively determine the truth of either of them. The only additional constraint is that **Shalonda is a liar**, and her claim that **Crista, Megan, and Mona are all truthful** is false. 
This means that **at least one of them is lying**, but it could be any one of the three.\\n\\nTherefore, **Mona’s status cannot be determined from the given information**.\\n\\n---\\n\\n### 🧐 **Ryan**  \\nRyan says:  \\n> \"Christie tells the truth.\"\\n\\nTo determine whether this is true, we would need to determine whether **Christie** is truthful. However, **Christie’s own statement** is complex and involves the truth of **Shalonda, Octavius, and Maverick**, which are interconnected and not immediately resolvable from the provided data.\\n\\nThus, we are **unable']\n",
      " 64%|██████▍   | 1302/2032 [00:32<00:10, 71.71it/s]We did not manage to extract a prediction in the correct format. Gold: ['\\\\boxed{1}'], Pred: ['To determine the **position of the person who drives the Bajaj**, we begin by analyzing the constraints imposed by the clues, particularly those that involve motorbikes and their relationships.\\n\\n---\\n\\n### Key Observations and Logical Deductions:\\n\\n- **Motorbikes listed**: **Kawasaki, Buell, Bajaj, Benelli, Suzuki**']\n",
      " 96%|█████████▌| 1954/2032 [00:44<00:00, 82.54it/s] We did not manage to extract a prediction in the correct format. Gold: ['\\\\boxed{no, unknown, yes}'], Pred: ['To determine the truth values of **Kristian, Orin, and Dallas**, we analyze the logical implications of their statements and the status of other characters. Here\\'s the structured breakdown:\\n\\n---\\n\\n### **Key Deductions:**\\n\\n- **Delfina is a truth-teller** by assumption.\\n- **Fidel says** that **exactly one of Delfina, Orin, and Mark** tell the truth. \\n    - Delfina is a truth-teller, so this implies **Orin and Mark must both be liars** to maintain exactly one truth-teller in total (which makes Fidel’s statement *false*), and the rest are liars. This is logically consistent.\\n\\n- From the above, **Orin is a liar**, and **Mark is a liar**.\\n    - Then **Orin’s statement \"Mark lies\"** is false, implying **Mark is truthful**, which is a contradiction. However, this contradiction disappears due to a misreading: **Orin is a liar**, and **Mark is a liar** — so **Mark does not lie**, but **Orin\\'s statement is false**, making the statements compatible.\\n\\n- Thus, **Orin is a liar** and **Mark is a liar**, making **Fidel\\'s statement false** — hence, **Fidel is a liar**.\\n\\n- **Orhan** and **Minerva** contradict each other, so one is truthful and the other is a liar. However, without further statements about them, we cannot determine their true status, and their identity doesn\\'t directly affect **Kristian** or **Dallas**.\\n\\n- **Kristian** says: \"Exactly two of Tamika, Minerva, and Orhan tell the truth.\" This statement depends on **Tamika**, **Minerva**, and **Orhan**. While **Orhan** is known to be a **liar** (in this setup), we do not have enough information about Tamika or Minerva to determine whether the statement is true. Therefore, **Kristian\\'s status is unknown**.\\n\\n- **Dallas** says \"Amberly lies.\" The status of **Amberly** depends on **Kristian**. 
If **Kristian is truthful**, then **Amberly is truthful** → **Dallas is a liar**. If **Kristian is a liar**, then **Amberly is a liar** → **Dallas is truthful**. Since **Kristian is unknown**, **Dallas is also unknown**.\\n\\n---\\n\\n### **Final']\n",
      " 99%|█████████▉| 2007/2032 [00:45<00:00, 84.79it/s]We did not manage to extract a prediction in the correct format. Gold: ['\\\\boxed{ahɨka}'], Pred: ['To solve the problem, we need to determine the English translations of the given Xinka forms based on the provided verb forms in the Guazacapán Xinka list.\\n\\nThe forms in the rows of the list have two parts: a **base form** (first column) and a **conjugated form** (second column), with the **third column** providing the English translation for the **conjugated form**.\\n\\nLet’s analyze the given blank forms in the problem in the context of the original verb forms and their translations:\\n\\n---\\n\\n### Step-by-Step Translation of the Blanks\\n\\n#### (1) **herʼoŋ**  \\nThere is **no explicit form** in the original data for \"herʼoŋ\" as a conjugated form. However, considering the sentence “it chose (it)” in (4), we can assume that the verb **herʼoŋ** represents the **infinitive form** or **stem** of the verb “choose”, which in this context translates to **“choose”**.\\n\\n#### (2) **ɨniʂpaʔ**  \\nThis form closely resembles the **trigonal** form of the verb \"say, tell\", as in:  \\n**imʼay | ɨnimʼa | say, tell**.  \\nGiven the similarity and the lack of better alternatives, we infer that **ɨniʂpaʔ** translates to **“say, tell”**.\\n\\n#### (3) **aplaʔ**  \\nThis matches the form **aplayʼ** (first column) which, when conjugated to **ɨnapalʼa**, translates to “open (it)”. So, **aplaʔ** is a conjugated form of the same verb, and the translation remains **“open (it)”.\\n\\n#### (4) **it chose (it)**  \\nThis is a clause in the future or past indicative, and the base form used for the verb **herʼoŋ** is known to mean **“choose”**.\\n\\n#### (5) **I saw (it)**  \\nThis clearly refers to the past tense of the verb **see**, which is **ɨmbirʼi** in the original data. 
So its translation is **“see”**.\\n\\n#### (6) **I plant (it)**  \\nThe Xinka form for **“plant”** is not explicitly listed, but since it\\'s a form of **planting**, and assuming a logical connection with the verb **dig** from the original list, we\\'ll assign the equivalent concept as **“plant”**.\\n\\n#### (7) **it was trapped**  \\nThis is the **passive past tense** of the verb **trap** (k’aniy’), which translates to **“trap”**.\\n\\n#### (8) **I leap**  \\nNo form corresponding to **“leap”** is given in the original list, but **“leap”** is a common action and is likely to be **the translation** of this form.\\n\\n#### (9) **it is being sewn, woven**  \\nThis is the **present continuous tense** of **sew, weave**, which in the original data is **h']\n",
      "100%|██████████| 2032/2032 [00:45<00:00, 44.37it/s]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Maps each data source to a list of (prompt, correctness, response length in tokens) triples.\n",
    "correctness_response_length_list = {}\n",
    "for line in tqdm(data):\n",
    "    # Fetch (or create on first sight) the list that collects this data source's triples.\n",
    "    source_records = correctness_response_length_list.setdefault(line['data_source'], [])\n",
    "    question = line['prompt']\n",
    "    response_correct = getCorrectness(line['response'], line['ground_truth'])\n",
    "    response_token_count = len(tokenizer.encode(line['response']))\n",
    "    source_records.append((question, response_correct, response_token_count))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "id": "22c5215e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_aucoaa(data):\n",
    "    \"\"\"Compute AUC-OAA: length-limited accuracy averaged over all length thresholds.\n",
    "\n",
    "    For every integer threshold t in [0, tmax], where tmax is the longest\n",
    "    response length in `data`, OAA_t is the sum of the correctness values of\n",
    "    the responses strictly shorter than t, divided by the total number of\n",
    "    responses (not by the number of responses that passed the filter).\n",
    "    The result is the mean of OAA_t over those tmax + 1 thresholds.\n",
    "\n",
    "    Args:\n",
    "        data: sequence of tuples whose last two items are\n",
    "            (correctness, response_length); leading items such as the prompt\n",
    "            are ignored, so both 2-tuples and 3-tuples are accepted.\n",
    "            Lengths are expected to be non-negative integers.\n",
    "\n",
    "    Returns:\n",
    "        The AUC-OAA value; 0.0 when `data` is empty.\n",
    "    \"\"\"\n",
    "    pairs = [tuple(item[-2:]) for item in data]\n",
    "    if not pairs:\n",
    "        # Nothing to score; avoids max() failing on an empty sequence.\n",
    "        return 0.0\n",
    "    n = len(pairs)\n",
    "    tmax = max(length for _, length in pairs)\n",
    "    # A response of length L satisfies `L < t` for exactly tmax - L of the\n",
    "    # thresholds 0..tmax, so summing OAA_t over all thresholds collapses to a\n",
    "    # single pass over the samples instead of an O(n * tmax) scan.\n",
    "    weighted_correct = sum(correct * (tmax - length) for correct, length in pairs)\n",
    "    aucoaa = weighted_correct / (n * (tmax + 1))\n",
    "    return aucoaa\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "id": "08506db8",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "\n",
    "def compute_accuracy_from_runs(prompts, accuracies, n_runs=5):\n",
    "    \"\"\"\n",
    "    Summarise 0/1 correctness for prompts that were each answered several times.\n",
    "\n",
    "    prompts: list of prompt IDs, one entry per answer (a prompt answered k times appears k times)\n",
    "    accuracies: list of 0/1, same length as prompts\n",
    "    n_runs: expected number of answers per prompt (default 5). It is the minimum\n",
    "        number of run buckets: a prompt with more answers adds buckets instead of\n",
    "        raising IndexError, and buckets that receive no answer are ignored.\n",
    "\n",
    "    Returns:\n",
    "        dict with per-answer, per-prompt, and per-run accuracy + std.\n",
    "        Accuracies are NaN for empty input; a sample std that needs at least two\n",
    "        values is reported as 0.0 when fewer are available.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: if prompts and accuracies differ in length.\n",
    "    \"\"\"\n",
    "    assert len(prompts) == len(accuracies), \"Both lists must have the same length\"\n",
    "\n",
    "    accuracies = np.array(accuracies, dtype=float)\n",
    "    total_answers = len(accuracies)\n",
    "    if total_answers == 0:\n",
    "        # No answers at all: nothing can be estimated.\n",
    "        nan = float(\"nan\")\n",
    "        return {\n",
    "            \"per_answer_accuracy\": nan,\n",
    "            \"per_answer_std\": 0.0,\n",
    "            \"per_prompt_accuracy\": nan,\n",
    "            \"per_prompt_std\": 0.0,\n",
    "            \"per_run_accuracy\": nan,\n",
    "            \"per_run_std\": 0.0,\n",
    "        }\n",
    "\n",
    "    def _sample_std(values):\n",
    "        # std with ddof=1 is undefined for fewer than two values; report 0.0 instead of NaN.\n",
    "        return values.std(ddof=1) if len(values) > 1 else 0.0\n",
    "\n",
    "    # ------------------\n",
    "    # Per-answer accuracy (binomial estimate)\n",
    "    # ------------------\n",
    "    acc_answer = accuracies.mean()\n",
    "    std_answer = np.sqrt(acc_answer * (1 - acc_answer) / total_answers)\n",
    "\n",
    "    # ------------------\n",
    "    # Per-prompt accuracy\n",
    "    # ------------------\n",
    "    prompt_correct = defaultdict(list)\n",
    "    for p, a in zip(prompts, accuracies):\n",
    "        prompt_correct[p].append(a)\n",
    "\n",
    "    prompt_solved = np.array([np.mean(v) for v in prompt_correct.values()])\n",
    "    acc_prompt = prompt_solved.mean()\n",
    "    std_prompt = _sample_std(prompt_solved)\n",
    "\n",
    "    # ------------------\n",
    "    # Per-run accuracy (like papers report)\n",
    "    # ------------------\n",
    "    # The i-th answer of every prompt (in input order) is treated as run i.\n",
    "    # Size the buckets by the data so a prompt with more than n_runs answers\n",
    "    # cannot index past the end of the list.\n",
    "    max_answers = max(len(v) for v in prompt_correct.values())\n",
    "    run_correct = [[] for _ in range(max(n_runs, max_answers))]\n",
    "\n",
    "    for answers in prompt_correct.values():\n",
    "        for i, a in enumerate(answers):\n",
    "            run_correct[i].append(a)\n",
    "\n",
    "    # Skip buckets no prompt contributed to; their mean would be NaN.\n",
    "    run_accs = np.array([np.mean(r) for r in run_correct if r])\n",
    "    acc_run = run_accs.mean()\n",
    "    std_run = _sample_std(run_accs)\n",
    "\n",
    "    return {\n",
    "        \"per_answer_accuracy\": acc_answer,\n",
    "        \"per_answer_std\": std_answer,\n",
    "        \"per_prompt_accuracy\": acc_prompt,\n",
    "        \"per_prompt_std\": std_prompt,\n",
    "        \"per_run_accuracy\": acc_run,\n",
    "        \"per_run_std\": std_run,\n",
    "    }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "9e63045f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Source:  bbeh\n"
     ]
    },
    {
     "ename": "IndexError",
     "evalue": "list index out of range",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mIndexError\u001b[39m                                Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[126]\u001b[39m\u001b[32m, line 7\u001b[39m\n\u001b[32m      4\u001b[39m accuracy_list = [correct \u001b[38;5;28;01mfor\u001b[39;00m _, correct, _ \u001b[38;5;129;01min\u001b[39;00m correctness_response_length_list[key]]\n\u001b[32m      5\u001b[39m response_length_list = [length \u001b[38;5;28;01mfor\u001b[39;00m _, _, length \u001b[38;5;129;01min\u001b[39;00m correctness_response_length_list[key]]\n\u001b[32m----> \u001b[39m\u001b[32m7\u001b[39m result = \u001b[43mcompute_accuracy_from_runs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprompt_list\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maccuracy_list\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m      8\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mAccuracy: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult[\u001b[33m'\u001b[39m\u001b[33mper_run_accuracy\u001b[39m\u001b[33m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m      9\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mStd: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult[\u001b[33m'\u001b[39m\u001b[33mper_run_std\u001b[39m\u001b[33m'\u001b[39m]\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[125]\u001b[39m\u001b[32m, line 44\u001b[39m, in \u001b[36mcompute_accuracy_from_runs\u001b[39m\u001b[34m(prompts, accuracies, n_runs)\u001b[39m\n\u001b[32m     42\u001b[39m     answers = prompt_correct[p]   \u001b[38;5;66;03m# length = n_runs\u001b[39;00m\n\u001b[32m     43\u001b[39m     \u001b[38;5;28;01mfor\u001b[39;00m i, a \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(answers):\n\u001b[32m---> \u001b[39m\u001b[32m44\u001b[39m         \u001b[43mrun_correct\u001b[49m\u001b[43m[\u001b[49m\u001b[43mi\u001b[49m\u001b[43m]\u001b[49m.append(a)\n\u001b[32m     46\u001b[39m run_accs = np.array([np.mean(r) \u001b[38;5;28;01mfor\u001b[39;00m r \u001b[38;5;129;01min\u001b[39;00m run_correct])\n\u001b[32m     47\u001b[39m acc_run = run_accs.mean()\n",
      "\u001b[31mIndexError\u001b[39m: list index out of range"
     ]
    }
   ],
   "source": [
    "for key, key_records in correctness_response_length_list.items():\n",
    "    print(\"Data Source: \", key)\n",
    "    # Each record is a (prompt, correctness, response length) triple; split it into parallel lists.\n",
    "    prompt_list = [record[0] for record in key_records]\n",
    "    accuracy_list = [record[1] for record in key_records]\n",
    "    response_length_list = [record[2] for record in key_records]\n",
    "\n",
    "    result = compute_accuracy_from_runs(prompt_list, accuracy_list)\n",
    "    print(f\"Accuracy: {result['per_run_accuracy']}\")\n",
    "    print(f\"Std: {result['per_run_std']}\")\n",
    "\n",
    "    # Token-length statistics of the responses for this data source.\n",
    "    print(f\"Response Length: {np.mean(response_length_list)}\")\n",
    "    print(f\"Max Response Length: {max(response_length_list)}\")\n",
    "    print(f\"Min Response Length: {min(response_length_list)}\")\n",
    "\n",
    "    print(\"AUC-OAA: \", compute_aucoaa(key_records))\n",
    "    print(\"--------------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "856e01dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch notes: result values pasted into a code cell; this cell has not been executed.\n",
    "# The first number equals the math-amc accuracy printed by a later cell's output.\n",
    "# NOTE(review): the dicts presumably came from an earlier version of\n",
    "# compute_accuracy_from_runs that returned per-answer/per-prompt keys — confirm.\n",
    "0.7951807228915663\n",
    "{'per_answer_accuracy': np.float64(0.7951807228915663), 'per_answer_std': np.float64(0.019810447580330987), 'per_prompt_accuracy': np.float64(0.7951807228915663), 'per_prompt_std': np.float64(0.35334761936606585)}\n",
    "0.4720812182741117\n",
    "{'per_answer_accuracy': np.float64(0.4720812182741117), 'per_answer_std': np.float64(0.015906469699511084), 'per_prompt_accuracy': np.float64(0.4720812182741117), 'per_prompt_std': np.float64(0.415861821515785)}\n",
    "0.45454545454545453\n",
    "{'per_answer_accuracy': np.float64(0.45454545454545453), 'per_answer_std': np.float64(0.023737862175776978), 'per_prompt_accuracy': np.float64(0.45454545454545453), 'per_prompt_std': np.float64(0.43599775674554486)}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "id": "3651922f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Source:  bbeh\n",
      "Accuracy:  0.18405511811023623\n",
      "Response Length:  5440.876968503937\n",
      "Number of Samples:  2032\n",
      "AUC-OAA:  0.1128701425069317\n",
      "--------------------------------\n"
     ]
    }
   ],
   "source": [
    "for key in correctness_response_length_list.keys():\n",
    "    # Each entry is (question, correctness, response_length)\n",
    "    correctness_values = [correct for _, correct, _ in correctness_response_length_list[key]]\n",
    "    response_lengths = [length for _, _, length in correctness_response_length_list[key]]\n",
    "    print(\"Data Source: \", key)\n",
    "    print(\"Accuracy: \", np.mean(correctness_values))\n",
    "    # print(\"Accuracy Std: \", np.std(correctness_values, ddof=1) if len(correctness_values) > 1 else 0.0)\n",
    "    print(\"Response Length: \", np.mean(response_lengths))\n",
    "    # print(\"Response Length Std: \", np.std(response_lengths, ddof=1) if len(response_lengths) > 1 else 0.0)\n",
    "    print(\"Number of Samples: \", len(correctness_response_length_list[key]))\n",
    "    # For AUC-OAA, need (correctness, response_length) tuples\n",
    "    aucoaa_data = [(_, correct, length) for _, correct, length in correctness_response_length_list[key]]\n",
    "    print(\"AUC-OAA: \", compute_aucoaa(aucoaa_data))\n",
    "    print(\"--------------------------------\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ea835a98",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Source:  math-amc\n",
      "Accuracy:  0.7951807228915663\n",
      "Response Length:  4156.530120481928\n",
      "Number of Samples:  415\n",
      "AUC-OAA:  0.5376238678997872\n",
      "--------------------------------\n",
      "Data Source:  gpqa\n",
      "Accuracy:  0.4720812182741117\n",
      "Response Length:  4193.119796954315\n",
      "Number of Samples:  985\n",
      "AUC-OAA:  0.30289823809497224\n",
      "--------------------------------\n",
      "Data Source:  math-aime\n",
      "Accuracy:  0.45454545454545453\n",
      "Response Length:  6651.779545454546\n",
      "Number of Samples:  440\n",
      "AUC-OAA:  0.24861373632365236\n",
      "--------------------------------\n"
     ]
    }
   ],
   "source": [
    "for key in correctness_response_length_list.keys():\n",
    "    print(\"Data Source: \", key)\n",
    "    print(\"Accuracy: \", np.mean([correct for correct, _ in correctness_response_length_list[key]]))\n",
    "    print(\"Response Length: \", np.mean([length for _, length in correctness_response_length_list[key]]))\n",
    "    print(\"Number of Samples: \", len(correctness_response_length_list[key]))\n",
    "    print(\"AUC-OAA: \", compute_aucoaa(correctness_response_length_list[key]))\n",
    "    print(\"--------------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "7828a9c2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'<think>\\nOkay, so I need to figure out which of the given options A-J have the correct adjective order based on the example sentences provided. The user mentioned that in this variant of English, the adjectives follow a specific order. From the examples given, I need to determine what that order is.\\n\\nLooking at the example sentences, like \"ancient key\" (1), \"pyramidal earring\" (2), \"repulsive normal-size hammer\" (3), etc., I need to find a pattern in the adjectives. The correct adjective order in these examples probably follows a certain sequence, such as opinion, size, age, material, purpose, and then other descriptors.\\n\\nLet me try to deduce the order based on the examples. For instance, in (3) \"repulsive normal-size hammer\", the adjectives are \"repulsive\" (opinion), \"normal-size\" (size), and then \"hammer\" (noun). Similarly, (4) \"red snorkeling bag\" has \"red\" (color), \"snorkeling\" (purpose), then \"bag\". Wait, but that seems different. Wait, maybe the order is opinion, size, age, material, purpose, then other descriptors.\\n\\nWait, looking at example (1): \"ancient key\" – ancient is an adjective of age, then key. But in example (2): \"pyramidal earring\" – pyramidal is a shape descriptor. Maybe the order is opinion, size, age, material, purpose, and then other adjectives. Wait, but how do they fit?\\n\\nAlternatively, perhaps the order is:\\n\\n1. Opinion adjectives (e.g., repulsive, awful, nice, terrible)\\n2. Size adjectives (e.g., large, small, normal-size, medium-size)\\n3. Age adjectives (e.g., ancient, old, old-fashioned)\\n4. Material adjectives (e.g., steel, leather, wool)\\n5. Purpose adjectives (e.g., snorkeling, eating, driving)\\n6. Shape adjectives (e.g., pyramidal, triangular, spherical)\\n\\nBut looking at the example (3): \"repulsive normal-size hammer\" – the order is opinion (repulsive), size (normal-size), then the noun. So maybe that\\'s correct. 
Then in example (4): \"red snorkeling bag\" – red is a color, which is a material or maybe a different category. Wait, but the example (2): \"pyramidal earring\" – pyramidal is a shape, which is a different category. So maybe the order is:\\n\\nThe adjectives in the correct order are: opinion, size, age, material, purpose, shape.\\n\\nBut I need to check which examples fit this.\\n\\nLooking at example (3): repulsive (opinion), normal-size (size), hammer (noun). So that fits opinion, size.\\n\\nExample (4): red (color, which might be a material or a different category). But maybe color is considered a material? Hmm. Or maybe the order is opinion, size, age, material, purpose, shape. Wait, example (5): square wrench – square is a shape, so maybe that\\'s after purpose. Wait, in example (5), \"square wrench\" – the adjective is shape (square), noun. So perhaps the order is opinion, size, age, material, purpose, shape. But if that\\'s the case, then in the example (5), the adjective is a shape, which would be last. But then example (2) is \"pyramidal earring\" – pyramidal is shape, so it\\'s last. But in that example, the order is shape, then noun. But according to the examples given, the adjectives are placed before the noun, but the order of the adjectives is unclear.\\n\\nWait, looking at the example (1): \"ancient key\" – the adjective is \"ancient\", which is age. So that would be age. Then in example (2): \"pyramidal earring\" – shape. So maybe the order is: opinion, age, size, material, purpose, shape.\\n\\nBut example (3): \"repulsive normal-size hammer\" – opinion, size. Then in example (4): \"red snorkeling bag\" – red is color (maybe a material?), snorkeling is purpose. Then \"bag\".\\n\\nAnother example (7): \"nigerian box\" – \"nigerian\" is nationality, which may be a material? Or maybe not. Wait, but in the given examples, some adjectives are not in the standard order. 
For instance, \"pyramidal earring\" – pyramidal is shape, so maybe shape is after purpose? Or perhaps the correct order is opinion, size, age, material, purpose, shape. Let me check if that holds.\\n\\nBut looking at (4): \"red snorkeling bag\" – red (color), snorkeling (purpose). But if the order is opinion, size, age, material, purpose, shape, then \"red\" would be a material, but red is color, which is a different category. Hmm.\\n\\nAlternatively, the correct order is:\\n\\nOpinion, size, age, material, purpose, shape.\\n\\nSo for example:\\n\\n(3) repulsive (opinion) normal-size (size) hammer (noun). Correct.\\n\\n(4) red (material?) snorkeling (purpose) bag. But red is color. Maybe that\\'s considered a material? Maybe not. Alternatively, the order could be:\\n\\nOpinion, size, age, shape, material, purpose.\\n\\nBut this is getting confusing. Let\\'s look at example (4): \"red snorkeling bag\" – the adjectives are \"red\" and \"snorkeling\". If the correct order is opinion, size, age, shape, material, purpose, but that doesn\\'t fit here.\\n\\nAlternatively, perhaps the adjectives are ordered as:\\n\\nOpinion (like repulsive, awful), then size (like normal-size), then age (like ancient), then material (like steel), then purpose (like snorkeling), then shape (like pyramidal).\\n\\nSo for example (3): repulsive (opinion), normal-size (size), hammer (noun) – which fits.\\n\\nIn (4): red (color, perhaps considered a material?), snorkeling (purpose), bag. But maybe red is a color, which is a different category. However, in the example (3), the adjectives are opinion and size. So maybe the order is opinion, size, age, material, purpose, shape.\\n\\nBut then in example (4), the adjectives are red (color, maybe material) and snorkeling (purpose). So that would be material (red) and purpose (snorkeling). But according to the example, this is correct. 
But this might not fit the order.\\n\\nAlternatively, perhaps the adjectives are ordered as: opinion, size, age, material, purpose, shape.\\n\\nSo let\\'s check the example (2): pyramidal earring. Here, \"pyramidal\" is a shape, which would be the last. So if the correct order is opinion, size, age, material, purpose, shape, then pyramidal would be the last adjective. But in this case, \"pyramidal\" is first, which might not fit.\\n\\nAlternatively, maybe the order is: opinion, shape, size, age, material, purpose.\\n\\nBut in that case, example (2) would fit. But example (3) is repulsive (opinion), normal-size (size), so that would be first two. Then the noun.\\n\\nHmm, this is complicated. Maybe the correct order is that opinion, size, age, shape, material, purpose. Or some other order.\\n\\nAlternatively, looking at example (2): \"pyramidal earring\" – the adjective is shape. So perhaps the correct order is that the adjectives are arranged in a specific order, and the examples provided show that the order is: opinion, size, age, material, purpose, shape.\\n\\nWait, example (2) is pyramidal, which is shape. If the correct order is that shape comes after purpose, then that would not fit. But maybe the order is different.\\n\\nAlternatively, perhaps the adjectives in the examples are ordered in the following way:\\n\\n- Opinion (like repulsive, awful, nice, etc.)\\n- Size (like large, small, normal-size, etc.)\\n- Age (like ancient, old, old-fashioned)\\n- Material (like steel, leather, wool)\\n- Purpose (like snorkeling, eating, driving)\\n- Shape (like pyramidal, triangular, spherical)\\n\\nBut in the example (3), the adjectives are repulsive (opinion) and normal-size (size), then the noun. So that\\'s correct.\\n\\nIn example (4): red (color) is a material? Or maybe that\\'s a different category. However, in the example (4), \"red snorkeling bag\" is given as correct. So red is a color, and snorkeling is a purpose. 
But according to the order above, the correct order would be opinion, size, age, material, purpose, shape. But in example (4), the adjectives are red (color) and snorkeling (purpose). However, since red is a color, maybe that is considered a material.\\n\\nAlternatively, the order might be opinion, size, age, shape, material, purpose. But I\\'m not sure.\\n\\nAlternatively, maybe the correct order is: opinion, size, age, material, purpose, shape.\\n\\nBut how to check? Let\\'s look at example (1): ancient key – ancient (age), key (noun). So that\\'s age first. But in the example (3), the adjectives are opinion and size. So the order is opinion, size, age, etc. That seems inconsistent.\\n\\nAlternatively, maybe the correct order is:\\n\\nOpinion (like repulsive, awful, nice, etc.)\\n\\nSize (like large, small, normal-size)\\n\\nAge (ancient, old, old-fashioned)\\n\\nMaterial (steel, leather, wool)\\n\\nPurpose (snorkeling, eating, driving)\\n\\nShape (pyramidal, triangular, spherical)\\n\\nSo in that order, the adjectives should be ordered as opinion, size, age, material, purpose, shape. Then, in example (2) \"pyramidal earring\" would have the adjective as shape, which would be last. But in that example, the adjective is first. So that would not fit.\\n\\nAlternatively, maybe the correct order is: opinion, size, age, material, purpose, shape. But in example (2), the adjective is shape, so that would be placed at the end, but in the example, it\\'s first. So this is conflicting.\\n\\nAlternatively, maybe the adjectives are ordered based on their category, and the correct order is:\\n\\n1. Opinion (e.g., repulsive, awful, nice, terrible)\\n2. Size (e.g., large, small, normal-size)\\n3. Age (e.g., ancient, old, old-fashioned)\\n4. Shape (e.g., pyramidal, triangular, spherical)\\n5. Material (e.g., steel, leather, wool)\\n6. Purpose (e.g., snorkeling, eating, driving)\\n\\nSo, for example, (2) \"pyramidal earring\" would have shape as the first adjective. 
Then in (3) \"repulsive normal-size hammer\" would be opinion, size.\\n\\nThis might fit. Let\\'s check example (2): \"pyramidal earring\" – shape, then noun. That would fit if shape is first. So the order is shape first.\\n\\nBut in example (3), the adjectives are opinion and size. So that would be opinion first, then size. So this order would not be consistent.\\n\\nAlternatively, the correct order is that the adjectives are ordered as: opinion, size, age, material, purpose, shape. But example (2) has shape first, which would not fit.\\n\\nAlternatively, maybe the correct order is: shape, material, purpose, size, age, opinion. But this is just a guess.\\n\\nThis is really confusing. Maybe I should check the given examples again for any patterns.\\n\\nLooking at example (4): \"red snorkeling bag\" – red (color, material?), snorkeling (purpose), bag.\\n\\nExample (5): \"square wrench\" – square (shape), wrench.\\n\\nExample (6): \"awful large apple\" – awful (opinion), large (size), apple.\\n\\nSo in example (6), the adjectives are opinion and size.\\n\\nIn example (7): \"nigerian box\" – \"nigerian\" is nationality, which is a material? Or maybe not. So perhaps in this case, it\\'s a material.\\n\\nBut the problem is that the correct order is not clear. Maybe the answer is to check if the adjectives in the options are in the order that is consistent with the given examples.\\n\\nBut since the examples are given, perhaps the correct adjective order is as follows, based on the examples:\\n\\nThe order is opinion, size, age, material, purpose, shape.\\n\\nSo, in the given examples:\\n\\n(1) ancient key – ancient (age), key.\\n\\nBut that would be age, which is first. So that might not fit. But maybe the order is age, then opinion?\\n\\nAlternatively, the correct order is opinion, size, age, material, purpose, shape.\\n\\nBut example (1) has \"ancient\" (age), so that would be the third position. 
But example (6) is \"awful large apple\" – opinion (awful), size (large), then apple.\\n\\nSo that fits the order opinion, size.\\n\\nIn example (2): pyramidal earring – shape, then noun. So shape is the last. So according to the order, if the order is opinion, size, age, material, purpose, shape, then shape should be last. So that would fit.\\n\\nSo for example (2), the adjective is shape, so it\\'s last. That would fit.\\n\\nSo maybe the correct order is: opinion, size, age, material, purpose, shape.\\n\\nBut in example (1), the adjective is \"ancient\", which is age. So in this case, the adjective is age. So that would be third.\\n\\nBut in the example (1), it\\'s \"ancient key\" – so the adjective is age. So that would fit in the third position.\\n\\nIn example (2): pyramidal earring – shape is last.\\n\\nIn example (3): \"repulsive normal-size hammer\" – opinion, then size.\\n\\nSo that fits the first two positions.\\n\\nIn example (4): red snorkeling bag – red (color, which might be material), snorkeling (purpose). So according to the order, material should come before purpose. But red is color, which is a material. Then snorkeling is purpose. So the order would be material (red), then purpose (snorkeling). 
So that fits.\\n\\nExample (5): square wrench – shape (square) is last.\\n\\nExample (6): awful large apple – opinion, size.\\n\\nSo the order seems to be opinion, size, age, material, purpose, shape.\\n\\nTherefore, in the options given, each option should have adjectives in that order.\\n\\nSo let\\'s analyze each option:\\n\\n(A) iranian square violet smoking old-fashioned rubber repulsive large screwdriver\\n\\nLet\\'s break down the adjectives:\\n\\n- iranian (nationality, maybe material?)\\n- square (shape)\\n- violet (color, material)\\n- smoking (purpose)\\n- old-fashioned (age)\\n- rubber (material)\\n- repulsive (opinion)\\n- large (size)\\n- screwdriver\\n\\nWait, but according to the order opinion, size, age, material, purpose, shape, this should be:\\n\\nOpinion: repulsive\\nSize: large\\nAge: old-fashioned\\nMaterial: rubber, violet (color)\\nPurpose: smoking\\nShape: square\\n\\nBut in the option, the order is: iranian, square, violet, smoking, old-fashioned, rubber, repulsive, large, screwdriver.\\n\\nThis is not in the correct order. Because the opinion should be first, then size, etc.\\n\\nSo the correct order for this would be repulsive (opinion), large (size), old-fashioned (age), rubber (material), smoking (purpose), square (shape), and then the noun. But in the option, the order is different.\\n\\nTherefore, option A is incorrect.\\n\\n(B) mysterious little maroon whittling ancient cloth prismlike brush\\n\\nAdjectives: mysterious (opinion), little (size), maroon (color/material), whittling (purpose), ancient (age), cloth (material), prismlike (shape)\\n\\nAccording to the order, the correct sequence would be opinion (mysterious), size (little), age (ancient), material (cloth), purpose (whittling), shape (prismlike), and then the noun. But in this option, the order is:\\n\\nmysterious (opinion), little (size), maroon (color, material), whittling (purpose), ancient (age), cloth (material), prismlike (shape). 
But here, the age comes after the purpose. That\\'s wrong. The correct order would be opinion, size, age, material, purpose, shape. In this case, the age (ancient) is after the purpose (whittling), which is incorrect. So option B is incorrect.\\n\\n(C) silly midsize crimson eating archaic leather square brazilian banana\\n\\nAdjectives: silly (opinion), midsize (size), crimson (color/material), eating (purpose), archaic (age), leather (material), square (shape), brazilian (nationality/material)\\n\\nCorrect order should be opinion (silly), size (midsize), age (archaic), material (leather, crimson, brazilian?), purpose (eating), shape (square). But in this option, after size (midsize), we have crimson (color), then eating (purpose), then archaic (age), which is incorrect. Because age should come before purpose. So the order is incorrect. So option C is incorrect.\\n\\n(D) tiny repulsive maroon drinking ancient glass pyramidal congolese fork\\n\\nAdjectives: tiny (size), repulsive (opinion), maroon (color/material), drinking (purpose), ancient (age), glass (material), pyramidal (shape), congolese (nationality)\\n\\nCorrect order is opinion (repulsive), size (tiny), age (ancient), material (glass), purpose (drinking), shape (pyramidal), and then nationality. But in the option, the order is size (tiny), opinion (repulsive), which is incorrect. The opinion should come before size. So this is wrong. Thus, D is incorrect.\\n\\n(E) green snorkeling archaic iron large awful toolbox\\n\\nAdjectives: green (color/material), snorkeling (purpose), archaic (age), iron (material), large (size), awful (opinion), toolbox.\\n\\nAccording to the order, opinion should be first, then size, then age, then material, then purpose. But here, the order is green (color/material), snorkeling (purpose), archaic (age), iron (material), large (size), awful (opinion). This is incorrect because opinion (awful) is last. 
So E is incorrect.\\n\\n(F) filipino spherical yellow exercise ancient wool medium-size good calculator\\n\\nAdjectives: filipino (nationality), spherical (shape), yellow (color), exercise (purpose), ancient (age), wool (material), medium-size (size), good (opinion), calculator.\\n\\nCorrect order: opinion (good), size (medium-size), age (ancient), material (wool), purpose (exercise), shape (spherical), then nationality. But in this option, the order is filipino, spherical, yellow, exercise, ancient, wool, medium-size, good. So the opinion (good) is last, which is wrong. Also, size (medium-size) is after material (wool), which is incorrect. So F is incorrect.\\n\\n(G) triangular violet drinking brand-new fiberglass small repulsive watch\\n\\nAdjectives: triangular (shape), violet (color/material), drinking (purpose), brand-new (age), fiberglass (material), small (size), repulsive (opinion), watch.\\n\\nThe order should be opinion (repulsive), size (small), age (brand-new), material (fiberglass), purpose (drinking), shape (triangular). But in the option, it\\'s triangular, violet, drinking, brand-new, fiberglass, small, repulsive. This is incorrect because the opinion is last. So G is incorrect.\\n\\n(H) russian prismlike aqua snorkeling old cloth house\\n\\nAdjectives: russian (nationality), prismlike (shape), aqua (color), snorkeling (purpose), old (age), cloth (material), house.\\n\\nThe correct order should be opinion (none here), then size (none), age (old), material (cloth), purpose (snorkeling), shape (prismlike). But the order is different. So this is incorrect. Also, \\'russian\\' is nationality, which might not fit the order.\\n\\n(I) massive lovely gray walking new cloth pen\\n\\nAdjectives: massive (size), lovely (opinion), gray (color), walking (purpose), new (age), cloth (material), pen.\\n\\nThe correct order would be opinion (lovely), size (massive), age (new), material (cloth), purpose (walking). 
But in this option, the order is size (massive), opinion (lovely), which is incorrect. Because opinion should come before size. So I is incorrect.\\n\\n(J) wonderful tiny gray walking archaic wood brush\\n\\nAdjectives: wonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush.\\n\\nThe order should be opinion (wonderful), size (tiny), age (archaic), material (wood), purpose (walking). The order here is correct. So J is correct.\\n\\nWait, let me check:\\n\\nThe adjectives are wonderful (opinion), tiny (size), gray (color, material), walking (purpose), archaic (age), wood (material), brush.\\n\\nBut the correct order should be opinion, size, age, material, purpose, shape. Wait, but in this case, the adjectives are:\\n\\nopinion: wonderful\\n\\nsize: tiny\\n\\nage: archaic\\n\\nmaterial: wood, gray?\\n\\nWait, gray is a color, which is part of material or color. In the correct order, after age comes material. So here, the order is:\\n\\nwonderful (opinion), tiny (size), archaic (age), gray (color, material), walking (purpose), wood (material), brush.\\n\\nBut in the option, the order is wonderful, tiny, gray, walking, archaic, wood. So the age (archaic) is after walking (purpose), which is incorrect. So J is incorrect.\\n\\nWait, no. Let\\'s check again. The option is:\\n\\nwonderful tiny gray walking archaic wood brush.\\n\\nThe adjectives are: wonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush.\\n\\nAccording to the correct order: opinion, size, age, material, purpose, shape.\\n\\nBut in this case, the order is opinion, size, then gray (color, which is material), walking (purpose), archaic (age), wood (material). Wait, this is incorrect because age should come before purpose. So the correct order would be opinion, size, age, material, purpose, shape. 
But in this option, the age (archaic) comes after the purpose (walking), which is wrong.\\n\\nSo J is also incorrect.\\n\\nWait, but the correct order is opinion, size, age, material, purpose, shape. So if the adjectives are arranged in that order, then J is incorrect. But if the order is different, perhaps I\\'m missing something.\\n\\nAlternatively, maybe the order is opinion, size, material, age, purpose, shape. But that\\'s not matching the examples.\\n\\nAlternatively, maybe the correct order is: opinion, size, age, purpose, material, shape.\\n\\nBut example (3) has opinion, size, then the noun. So that\\'s correct. But this is confusing.\\n\\nGiven the confusion, perhaps the correct answer is only option J, but I\\'m not sure. Alternatively, maybe none of them are correct. But let me check again.\\n\\nOption J is:\\n\\nwonderful (opinion), tiny (size), gray (color), walking (purpose), archaic (age), wood (material), brush.\\n\\nThe order would be opinion, size, purpose, age, material. That\\'s incorrect. So J is incorrect.\\n\\nAlternatively, maybe the correct order is opinion, size, material, purpose, age, shape. But that\\'s not matching any examples.\\n\\nGiven the confusion, maybe none of the options are correct. So the answer is K.\\n\\nBut let me check each option again more carefully.\\n\\nFor example, let\\'s recheck option J:\\n\\nwonderful (opinion), tiny (size), gray (color, material), walking (purpose), archaic (age), wood (material), brush.\\n\\nThe correct order should be opinion, size, age, material, purpose, shape. So here, the age (archaic) is after the purpose (walking), which is incorrect. So option J is wrong.\\n\\nLooking at all options, none seem to fit the correct order. Therefore, the answer is K.\\n\\nBut wait, let me check option (J) again. Maybe the age is supposed to come before purpose. 
So if the adjectives are in the order opinion, size, age, material, purpose, shape, then the correct order is:\\n\\nopinion, size, age, material, purpose, shape.\\n\\nIn option J, the order is: opinion (wonderful), size (tiny), then gray (color/material), then walking (purpose), then archaic (age), then wood (material). That would be opinion, size, purpose, age, material. Which is wrong.\\n\\nSo none of the options are correct. Therefore, the answer is K.\\n</think>\\n\\nTo determine which of the given options use the correct adjective order in this variant of English, we need to refer to the pattern in the examples provided. Based on the examples, the correct adjective order appears to be:\\n\\n1. **Opinion adjectives** (e.g., *wonderful, awful, nice, repulsive*)\\n2. **Size adjectives** (e.g., *large, small, normal-size, medium-size*)\\n3. **Age adjectives** (e.g., *ancient, old, old-fashioned, new*)\\n4. **Material adjectives** (e.g., *steel, leather, wood, cloth*)\\n5. **Purpose adjectives** (e.g., *snorkeling, eating, driving, walking*)\\n6. **Shape adjectives** (e.g., *pyramidal, triangular, spherical*)\\n\\nThis order is consistent across the examples provided. It starts with **opinion**, followed by **size**, then **age**, then **material**, then **purpose**, and finally **shape**.\\n\\n---\\n\\nLet\\'s analyze the options based on this order:\\n\\n### (A) iranian square violet smoking old-fashioned rubber repulsive large screwdriver  \\n- Adjectives: *iranian (nationality, likely material), square (shape), violet (color, likely material), smoking (purpose), old-fashioned (age), rubber (material), repulsive (opinion), large (size), screwdriver*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. 
Incorrect.\\n\\n---\\n\\n### (B) mysterious little maroon whittling ancient cloth prismlike brush  \\n- Adjectives: *mysterious (opinion), little (size), maroon (color, material), whittling (purpose), ancient (age), cloth (material), prismlike (shape), brush*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **age** comes after **purpose**. Incorrect.\\n\\n---\\n\\n### (C) silly midsize crimson eating archaic leather square brazilian banana  \\n- Adjectives: *silly (opinion), midsize (size), crimson (color, material), eating (purpose), archaic (age), leather (material), square (shape), brazilian (nationality)*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **age** comes after **purpose**. Incorrect.\\n\\n---\\n\\n### (D) tiny repulsive maroon drinking ancient glass pyramidal congolese fork  \\n- Adjectives: *tiny (size), repulsive (opinion), maroon (color, material), drinking (purpose), ancient (age), glass (material), pyramidal (shape), congolese (nationality)*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes after **size**. Incorrect.\\n\\n---\\n\\n### (E) green snorkeling archaic iron large awful toolbox  \\n- Adjectives: *green (color, material), snorkeling (purpose), archaic (age), iron (material), large (size), awful (opinion), toolbox*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. Incorrect.\\n\\n---\\n\\n### (F) filipino spherical yellow exercise ancient wool medium-size good calculator  \\n- Adjectives: *filipino (nationality), spherical (shape), yellow (color, material), exercise (purpose), ancient (age), wool (material), medium-size (size), good (opinion), calculator*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. 
Incorrect.\\n\\n---\\n\\n### (G) triangular violet drinking brand-new fiberglass small repulsive watch  \\n- Adjectives: *triangular (shape), violet (color, material), drinking (purpose), brand-new (age), fiberglass (material), small (size), repulsive (opinion), watch*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes last. Incorrect.\\n\\n---\\n\\n### (H) russian prismlike aqua snorkeling old cloth house  \\n- Adjectives: *russian (nationality), prismlike (shape), aqua (color, material), snorkeling (purpose), old (age), cloth (material), house*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** is missing and the rest are in incorrect order. Incorrect.\\n\\n---\\n\\n### (I) massive lovely gray walking new cloth pen  \\n- Adjectives: *massive (size), lovely (opinion), gray (color, material), walking (purpose), new (age), cloth (material), pen*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **opinion** comes after **size**. Incorrect.\\n\\n---\\n\\n### (J) wonderful tiny gray walking archaic wood brush  \\n- Adjectives: *wonderful (opinion), tiny (size), gray (color, material), walking (purpose), archaic (age), wood (material), brush*\\n\\nThe correct order should be: **opinion → size → age → material → purpose → shape**  \\nHere, **age** comes after **purpose**. Incorrect.\\n\\n---\\n\\n### Conclusion:\\n\\nNone of the options (A–J) follow the correct adjective order as deduced from the examples. Therefore, the correct answer is:\\n\\n$$\\n\\\\boxed{K}\\n$$'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "line['response']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "fa28c910",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4407"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(tokenizer.encode(line['response']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "d81e6485",
   "metadata": {},
   "outputs": [
    {
     "ename": "FileNotFoundError",
     "evalue": "[Errno 2] No such file or directory: 'your_file.jsonl'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mFileNotFoundError\u001b[39m                         Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[37]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mjson\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;66;03m## read jsonl\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43myour_file.jsonl\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mr\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[32m      4\u001b[39m     data = [json.loads(line) \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m f]\n",
      "\u001b[36mFile \u001b[39m\u001b[32mNONEcode/verl-fork/verl/venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py:343\u001b[39m, in \u001b[36m_modified_open\u001b[39m\u001b[34m(file, *args, **kwargs)\u001b[39m\n\u001b[32m    336\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[32m0\u001b[39m, \u001b[32m1\u001b[39m, \u001b[32m2\u001b[39m}:\n\u001b[32m    337\u001b[39m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m    338\u001b[39m         \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mIPython won\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m by default \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    339\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    340\u001b[39m         \u001b[33m\"\u001b[39m\u001b[33myou can use builtins\u001b[39m\u001b[33m'\u001b[39m\u001b[33m open.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m    341\u001b[39m     )\n\u001b[32m--> \u001b[39m\u001b[32m343\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "\u001b[31mFileNotFoundError\u001b[39m: [Errno 2] No such file or directory: 'your_file.jsonl'"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "# Placeholder path: point this at a real JSONL file before running the cell.\n",
    "jsonl_path = 'your_file.jsonl'\n",
    "\n",
    "# Parse one JSON object per line into a list.\n",
    "# Blank lines (e.g. a trailing newline at end of file) are skipped because json.loads('') raises.\n",
    "# NOTE: this rebinds `data`, replacing whatever an earlier cell stored under that name.\n",
    "with open(jsonl_path, 'r', encoding='utf-8') as f:\n",
    "    data = [json.loads(row) for row in f if row.strip()]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d0055897",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "55f7dbede25c49ef92fd461aeed8a3f9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/2499 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Load the 'train' split of the dataset stored in the local LC-R1 baseline directory.\n",
    "from datasets import load_dataset\n",
    "train_dataset = load_dataset('NONEcode/verl-fork/baselines/LC-R1/dataset', split='train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bd778c0c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['problem', 'solution'],\n",
       "    num_rows: 2499\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the loaded split's features and row count.\n",
    "train_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "b70c8c1f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'problem': 'Let $x$ , $y$ and $z$ all exceed $1$ and let $w$ be a positive number such that $\\\\log_xw=24$ , $\\\\log_y w = 40$ and $\\\\log_{xyz}w=12$ . Find $\\\\log_zw$ .',\n",
       " 'solution': '60'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first record of the split to see what its fields look like.\n",
    "train_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "14e28692",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>data_source</th>\n",
       "      <th>prompt</th>\n",
       "      <th>ability</th>\n",
       "      <th>reward_model</th>\n",
       "      <th>extra_info</th>\n",
       "      <th>uuid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'In triangle $ABC$, $\\sin \\angle ...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '34', 'style': 'rule-lighteva...</td>\n",
       "      <td>{'index': 0, 'split': 'train'}</td>\n",
       "      <td>d2ccd328-e712-4d6b-b909-a7956589db4e</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'Let $ABCD$ be a unit square in t...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '113', 'style': 'rule-lightev...</td>\n",
       "      <td>{'index': 1, 'split': 'train'}</td>\n",
       "      <td>5fb8b773-1e5f-48ab-b5ce-8d2ba6ed53cc</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'Let $a, b, c$ be distinct number...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '-3', 'style': 'rule-lighteva...</td>\n",
       "      <td>{'index': 2, 'split': 'train'}</td>\n",
       "      <td>974a3162-fb31-4720-86ab-6a3738ae84e7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'There are $7$ boxes arranged in ...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '3', 'style': 'rule-lighteval...</td>\n",
       "      <td>{'index': 3, 'split': 'train'}</td>\n",
       "      <td>c47003a1-29c7-46ad-b566-977ba9018acb</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>DAPO-Math-17k</td>\n",
       "      <td>[{'content': 'Let $S$ be the set of triples $(...</td>\n",
       "      <td>math</td>\n",
       "      <td>{'ground_truth': '37', 'style': 'rule-lighteva...</td>\n",
       "      <td>{'index': 4, 'split': 'train'}</td>\n",
       "      <td>f8e041a5-2848-4dfb-99c4-74aa70c8d211</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     data_source                                             prompt ability  \\\n",
       "0  DAPO-Math-17k  [{'content': 'In triangle $ABC$, $\\sin \\angle ...    math   \n",
       "1  DAPO-Math-17k  [{'content': 'Let $ABCD$ be a unit square in t...    math   \n",
       "2  DAPO-Math-17k  [{'content': 'Let $a, b, c$ be distinct number...    math   \n",
       "3  DAPO-Math-17k  [{'content': 'There are $7$ boxes arranged in ...    math   \n",
       "4  DAPO-Math-17k  [{'content': 'Let $S$ be the set of triples $(...    math   \n",
       "\n",
       "                                        reward_model  \\\n",
       "0  {'ground_truth': '34', 'style': 'rule-lighteva...   \n",
       "1  {'ground_truth': '113', 'style': 'rule-lightev...   \n",
       "2  {'ground_truth': '-3', 'style': 'rule-lighteva...   \n",
       "3  {'ground_truth': '3', 'style': 'rule-lighteval...   \n",
       "4  {'ground_truth': '37', 'style': 'rule-lighteva...   \n",
       "\n",
       "                       extra_info                                  uuid  \n",
       "0  {'index': 0, 'split': 'train'}  d2ccd328-e712-4d6b-b909-a7956589db4e  \n",
       "1  {'index': 1, 'split': 'train'}  5fb8b773-1e5f-48ab-b5ce-8d2ba6ed53cc  \n",
       "2  {'index': 2, 'split': 'train'}  974a3162-fb31-4720-86ab-6a3738ae84e7  \n",
       "3  {'index': 3, 'split': 'train'}  c47003a1-29c7-46ad-b566-977ba9018acb  \n",
       "4  {'index': 4, 'split': 'train'}  f8e041a5-2848-4dfb-99c4-74aa70c8d211  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the DAPO-17k training parquet into a pandas DataFrame and preview its first rows.\n",
    "# NOTE: `dataset` is a DataFrame at this point; later cells rebind the same name to other objects.\n",
    "import pandas as pd\n",
    "dataset = pd.read_parquet('NONEcode/verl-fork/verl/scripts/data/dapo-17k/train.parquet')\n",
    "dataset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "51a71306",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the pandas DataFrame into a Hugging Face `datasets.Dataset`.\n",
    "# NOTE: this rebinds `dataset`, so the input to `from_pandas` is whatever `dataset` currently holds;\n",
    "# re-run the parquet-loading cell first if this cell needs to be executed again.\n",
    "from datasets import Dataset\n",
    "dataset = Dataset.from_pandas(dataset)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "322a257a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info', 'uuid'],\n",
       "    num_rows: 20000\n",
       "})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the features and row count of `dataset`.\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "61da7071",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ab05b6d81e894388baf96b5dbcfc88f3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Reload the same parquet file through `load_dataset` (imported in an earlier cell).\n",
    "# No `split` is passed, so later cells index the result by split name: dataset['train'].\n",
    "dataset = load_dataset('parquet', data_files='NONEcode/verl-fork/verl/scripts/data/dapo-17k/train.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "74c16f7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the metadata columns from the dataset, leaving only 'prompt' and 'reward_model'.\n",
    "dataset = dataset.remove_columns(['data_source', 'ability', 'extra_info', 'uuid'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ddacf91f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'prompt': [{'content': \"In triangle $ABC$, $\\\\sin \\\\angle A = \\\\frac{4}{5}$ and $\\\\angle A < 90^\\\\circ$. Let $D$ be a point outside triangle $ABC$ such that $\\\\angle BAD = \\\\angle DAC$ and $\\\\angle BDC = 90^\\\\circ$. Suppose that $AD = 1$ and that $\\\\frac{BD}{CD} = \\\\frac{3}{2}$. If $AB + AC$ can be expressed in the form $\\\\frac{a\\\\sqrt{b}}{c}$ where $a, b, c$ are pairwise relatively prime integers, find $a + b + c$. Let's think step by step and output the final answer within \\\\boxed{}.\",\n",
       "   'role': 'user'}],\n",
       " 'reward_model': {'ground_truth': '34', 'style': 'rule-lighteval/MATH_v2'}}"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the first record of the 'train' split.\n",
    "dataset['train'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "b06bc9f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the 'train' split; from here on `dataset` refers to that split directly.\n",
    "dataset = dataset['train']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "5646a45f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a 'problem' column: the content of the first prompt message with the fixed\n",
    "# answer-format instruction removed and surrounding whitespace stripped.\n",
    "ANSWER_INSTRUCTION = \"Let's think step by step and output the final answer within \\\\boxed{}.\"\n",
    "\n",
    "# Guard on the column name so re-running the cell does not fail on an already-added column.\n",
    "if 'problem' not in dataset.column_names:\n",
    "    # Read only the 'prompt' column instead of materialising every column of every row.\n",
    "    problems = [messages[0]['content'].replace(ANSWER_INSTRUCTION, '').strip() for messages in dataset['prompt']]\n",
    "    dataset = dataset.add_column('problem', problems)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "ab61e1cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a 'solution' column holding each row's 'ground_truth' value from 'reward_model'.\n",
    "# Guard on the column name so re-running the cell does not fail on an already-added column.\n",
    "if 'solution' not in dataset.column_names:\n",
    "    # Read only the 'reward_model' column instead of materialising every column of every row.\n",
    "    solutions = [reward['ground_truth'] for reward in dataset['reward_model']]\n",
    "    dataset = dataset.add_column('solution', solutions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "d949b790",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the raw 'prompt' and 'reward_model' columns.\n",
    "# Only columns that are still present are removed, so re-running the cell is a no-op\n",
    "# instead of an error about missing columns.\n",
    "stale_columns = [name for name in ('prompt', 'reward_model') if name in dataset.column_names]\n",
    "if stale_columns:\n",
    "    dataset = dataset.remove_columns(stale_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "26564142",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['problem', 'solution'],\n",
       "    num_rows: 20000\n",
       "})"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the features and row count of the processed `dataset`.\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "868bb2f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the dataset locally in the `datasets` on-disk format at 'NONEcode/verl-fork/baselines/LC-R1/dapo-17k'.\n",
    "# The save call below is commented out, so running this cell only performs the import;\n",
    "# NOTE(review): presumably disabled to avoid overwriting an existing copy - uncomment to write it again.\n",
    "from datasets import load_dataset, load_from_disk\n",
    "# dataset.save_to_disk('NONEcode/verl-fork/baselines/LC-R1/dapo-17k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "93053ad7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the dataset back from the local `datasets` on-disk directory.\n",
    "# Uses `load_from_disk`, which is imported in the previous cell; this rebinds `dataset`.\n",
    "dataset = load_from_disk('NONEcode/verl-fork/baselines/LC-R1/dapo-17k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8960bb8b",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
