{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "70847e56",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_source = ['underthink-bench']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d3a3cc4c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO 09-11 00:19:05 [importing.py:53] Triton module has been replaced with a placeholder.\n",
      "INFO 09-11 00:19:06 [__init__.py:239] Automatically detected platform cuda.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "os.environ['HF_HOME'] = 'NONE.cache/huggingface'\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '7'\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
    "from vllm import LLM\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
    "# Qwen/Qwen3-4B\n",
    "# deepseek-ai/DeepSeek-R1-Distill-Llama-8B\n",
    "# model = LLM(model=\"Qwen/Qwen3-4B\", gpu_memory_utilization=0.95, tensor_parallel_size=1, task=\"embed\")\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "fa2e6ecd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "data = []\n",
    "with open('NONEcode/verl-fork/verl/scripts/train/verlCheckpoint/Summary/deepseek_qwen_7b_dapo_math_10k_context_linear_reward_with_summary_attention_loss_original_v2/rollout/val_140_v2.jsonl', 'r') as f:\n",
    "    for line in f:\n",
    "        data.append(json.loads(line))\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "c34625e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def last_boxed_only_string(string):\n",
    "    idx = string.rfind(\"\\\\boxed\")\n",
    "    if idx < 0:\n",
    "        idx = string.rfind(\"\\\\fbox\")\n",
    "        if idx < 0:\n",
    "            return None\n",
    "\n",
    "    i = idx\n",
    "    right_brace_idx = None\n",
    "    num_left_braces_open = 0\n",
    "    while i < len(string):\n",
    "        if string[i] == \"{\":\n",
    "            num_left_braces_open += 1\n",
    "        if string[i] == \"}\":\n",
    "            num_left_braces_open -= 1\n",
    "            if num_left_braces_open == 0:\n",
    "                right_brace_idx = i\n",
    "                break\n",
    "        i += 1\n",
    "    if right_brace_idx is None:\n",
    "        retval = None\n",
    "    else:\n",
    "        retval = string[idx : right_brace_idx + 1]\n",
    "\n",
    "    return retval\n",
    "\n",
    "\n",
    "def remove_boxed(s):\n",
    "    left = \"\\\\boxed{\"\n",
    "    try:\n",
    "        assert s[: len(left)] == left\n",
    "        assert s[-1] == \"}\"\n",
    "        return s[len(left) : -1]\n",
    "    except Exception:\n",
    "        return None\n",
    "\n",
    "\n",
    "def extract_boxed_answer(solution: str) -> str:\n",
    "    \"\"\"Extract the answer from inside a LaTeX \\\\boxed{} command\"\"\"\n",
    "    solution = last_boxed_only_string(solution)\n",
    "    solution = remove_boxed(solution)\n",
    "    return solution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "bffef363",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "with open('NONEcode/verl-fork/verl/scripts/data/optimalThinkingBench/underthink_bench.json', 'r') as f:\n",
    "    reference_dataset = json.load(f)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "0b5e07e4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': \"A::B is a system with 4 tokens: `A#`, `#A`, `B#` and `#B`.\\n\\nAn A::B program is a sequence of tokens. Example:\\n\\n    B# A# #B #A B#\\n\\nTo *compute* a program, we must rewrite neighbor tokens, using the rules:\\n\\n    A# #A ... becomes ... nothing\\n    A# #B ... becomes ... #B A#\\n    B# #A ... becomes ... #A B#\\n    B# #B ... becomes ... nothing\\n\\nIn other words, whenever two neighbor tokens have their '#' facing each-other,\\nthey must be rewritten according to the corresponding rule.\\n\\nNow, consider the following program:\\n\\nB# A# B# #A #B A# A# #B A# #A B# B# #A #A A# #B #A A# #A B#\\n\\nReturn the final state of the program.\\n\",\n",
       " 'answer': 'B# B#',\n",
       " 'metadata': {'source_dataset': 'ab',\n",
       "  'source_index': 0,\n",
       "  'difficulty': {'length': 20}},\n",
       " 'puzzle': 'ab',\n",
       " 'uuid': '2d5bf0ea-2256-4e93-8a04-31a2a98b86df'}"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "reference_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "e6dd28a6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'data_source': 'underthink-bench',\n",
       " 'prompt': \"<｜User｜>A::B is a system with 4 tokens: `A#`, `#A`, `B#` and `#B`.\\n\\nAn A::B program is a sequence of tokens. Example:\\n\\n    B# A# #B #A B#\\n\\nTo *compute* a program, we must rewrite neighbor tokens, using the rules:\\n\\n    A# #A ... becomes ... nothing\\n    A# #B ... becomes ... #B A#\\n    B# #A ... becomes ... #A B#\\n    B# #B ... becomes ... nothing\\n\\nIn other words, whenever two neighbor tokens have their '#' facing each-other,\\nthey must be rewritten according to the corresponding rule.\\n\\nNow, consider the following program:\\n\\nA# A# A# #A #A B# B# A# #A B# B# #A B# #A #A #A A# A# B# B#\\n\\nReturn the final state of the program.\\n \\nLet's think step by step and output the final answer within \\\\boxed{}.<｜Assistant｜><think>\\n\",\n",
       " 'response': \"Okay, so I have this problem where I need to compute the final state of an A::B program after applying certain rewrite rules. The program given is a long sequence of tokens: \\n\\nA# A# A# #A #A B# B# A# #A B# B# #A B# #A #A #A A# A# B# B#\\n\\nFirst, let me rewrite this sequence for clarity:\\n\\nA# A# A# #A #A B# B# A# #A B# B# #A B# #A #A #A A# A# B# B#\\n\\nI remember that the rules to apply are when two neighboring tokens have their '#' facing each other. The rules are as follows:\\n\\n1. A# #A becomes nothing.\\n2. A# #B becomes #B A#.\\n3. B# #A becomes #A B#.\\n4. B# #B becomes nothing.\\n\\nSo, whenever a pair of neighboring tokens follows any of these patterns, we replace them according to the corresponding rule. Also, after replacement, we need to check the new neighbors for any possible transformations because the replacement might cause new adjacent tokens that match the rules.\\n\\nThis process is applied repeatedly until there are no more applicable rules to apply. Let me see how I can approach this step by step.\\n\\nFirst, let's parse the original program into individual tokens:\\n\\n1. A#\\n2. A#\\n3. A#\\n4. #A\\n5. #A\\n6. B#\\n7. B#\\n8. A#\\n9. #A\\n10. B#\\n11. B#\\n12. #A\\n13. B#\\n14. #A\\n15. #A\\n16. #A\\n17. A#\\n18. A#\\n19. B#\\n20. B#\\n\\nSo, the sequence is 20 tokens long. Let me track the state step by step.\\n\\nI think the best way is to scan the sequence from left to right, checking each pair of neighboring tokens and applying the rules as we go. But sometimes, the replacement could affect the previous tokens, so after each replacement, I might need to go back and check the modified areas.\\n\\nAlternatively, maybe a better approach is to look for the leftmost applicable pair and replace it, then continue this process until no more replacements are possible. This is similar to evaluating expressions with operator precedence, applying the leftmost transformation first.\\n\\nLet me try to do that.\\n\\nFirst, scan the initial sequence:\\n\\nToken 1: A#\\nToken 2: A#\\nToken 3: A#\\nToken 4: #A\\nToken 5: #A\\nToken 6: B#\\nToken 7: B#\\nToken 8: A#\\nToken 9: #A\\nToken 10: B#\\nToken 11: B#\\nToken 12: #A\\nToken 13: B#\\nToken 14: #A\\nToken 15: #A\\nToken 16: #A\\nToken 17: A#\\nToken 18: A#\\nToken 19: B#\\nToken 20: B#\\n\\nLet me go through each pair starting from the first token.\\n\\n1. Token1 - A# and Token2 - A#: Both have A#, but they are in the same direction. The rule A# #A becomes nothing, but they are the same, not opposite. So I think this is A# followed by A#, which doesn't match any rule because the rule requires the '#' to be facing each other, which would be A# followed by #A or vice versa. So no operation here.\\n\\n2. Token2 - A# and Token3 - A#: Same as above, no operation.\\n\\n3. Token3 - A# and Token4 - #A: Now, here we have A# followed by #A. That's pair number 3 and 4. According to rule 1, A# #A becomes nothing. So tokens 3 and 4 are replaced by nothing. So we remove these two tokens.\\n\\nNow, after this replacement, the sequence becomes:\\n\\n1. A#\\n2. A#\\n3. [Removed]\\n4. [Removed]\\n5. #A\\n6. B#\\n7. B#\\n8. A#\\n9. #A\\n10. B#\\n11. B#\\n12. #A\\n13. B#\\n14. #A\\n15. #A\\n16. #A\\n17. A#\\n18. A#\\n19. B#\\n20. B#\\n\\nSo tokens 3 and 4 are removed, and the sequence's tokens after removal will have the rest continue from token5. But wait, token5 is #A. So now, we need to check the new neighbors after token2 and token5. Because when tokens3 and 4 are removed, token2's right neighbor is token5.\\n\\nToken2 - A# and Token5 - #A: A# followed by #A, which is exactly rule1. They become nothing. So in addition to removing tokens3 and4, we also need to remove tokens2 and5.\\n\\nSo after this step, tokens1, no need for further, token2, token3,4 gone, token5,6, etc.\\n\\nWait, let me be precise.\\n\\nOriginal sequence:\\n\\n1. A#\\n2. A#\\n3. A#\\n4. #A\\n5. #A\\n6. B#\\n7. B#\\n8. A#\\n9. #A\\n10. B#\\n11. B#\\n12. #A\\n13. B#\\n14. #A\\n15. #A\\n16. #A\\n17. A#\\n18. A#\\n19. B#\\n20. B#\\n\\nAfter replacing tokens3 and4 with nothing:\\n\\nThe sequence is:\\n\\n1. A#\\n2. A#\\n5. #A\\n6. B#\\n7. B#\\n8. A#\\n9. #A\\n10. B#\\n11. B#\\n12. #A\\n13. B#\\n14. #A\\n15. #A\\n16. #A\\n17. A#\\n18. A#\\n19. B#\\n20. B#\\n\\nBut after removing tokens3 and4, the next pair to check is token2 and token5.\\n\\nToken2: A# and token5: #A. They form A# #A, so according to rule1, this pair becomes nothing.\\n\\nSo we remove token2 and token5 as well.\\n\\nSo the sequence now is:\\n\\n1. A#\\n5. #A\\n6. B#\\n7. B#\\n8. A#\\n9. #A\\n10. B#\\n11. B#\\n12. #A\\n13. B#\\n14. #A\\n15. #A\\n16. #A\\n17. A#\\n18. A#\\n19. B#\\n20. B#\\n\\nSo token1 remains, then token5 through token20, since tokens2 and tokens3-4 removed.\\n\\nSo now, after first replacements, we have tokens 1,5,6,...20, with token1: A#, token5: #A, token6: B#, etc.\\n\\nNow, we need to check the new neighbors. After removing tokens2, the next pair is tokens1: A# and token5: #A. So checking them:\\n\\nA# followed by #A. So rule1 applies here as well. They become nothing.\\n\\nSo we remove token1 and token5 in addition.\\n\\nSo after this second replacement, tokens1, token5 are removed, so the sequence becomes:\\n\\n6. B#\\n7. B#\\n8. A#\\n9. #A\\n10. B#\\n11. B#\\n12. #A\\n13. B#\\n14. #A\\n15. #A\\n16. #A\\n17. A#\\n18. A#\\n19. B#\\n20. B#\\n\\nSo now, the sequence is from token6 to token20.\\n\\nSo far, we have operated three times: removing tokens3,4 then 2,5, then1,5.\\n\\nNow the current sequence:\\n\\nB#, B#, A#, #A, B#, B#, #A, B#, #A, #A, #A, A#, A#, B#, B#\\n\\nNow, let me note the structure:\\n\\n1. B#\\n2. B#\\n3. A#\\n4. #A\\n5. B#\\n6. B#\\n7. #A\\n8. B#\\n9. #A\\n10. #A\\n11. #A\\n12. A#\\n13. A#\\n14. B#\\n15. B#\\n\\nSo positions 6 to20 become 1-15 after removing the start.\\n\\nNow, scan from the start for applicable pairs:\\n\\nCheck token1 and token2:\\n\\nB# followed by B#: So according to rule4, B# #B becomes nothing.\\n\\nSo tokens1 and2: B# B# is replaced by nothing.\\n\\nSo the sequence becomes:\\n\\n3. A#\\n4. #A\\n5. B#\\n6. B#\\n7. #A\\n8. B#\\n9. #A\\n10. #A\\n11. #A\\n12. A#\\n13. A#\\n14. B#\\n15. B#\\n\\nNow, after removing tokens1 and2, the new sequence starts at token3.\\n\\nCheck token3: A# and token4: #A: A# #A, which is rule1, becomes nothing.\\n\\nSo tokens3 and4 are removed.\\n\\nSo sequence:\\n\\n5. B#\\n6. B#\\n7. #A\\n8. B#\\n9. #A\\n10. #A\\n11. #A\\n12. A#\\n13. A#\\n14. B#\\n15. B#\\n\\nNext, check token5 and6: B# B#, rule4: becomes nothing.\\n\\nSo tokens5 and6 are replaced by nothing.\\n\\nSequence becomes:\\n\\n7. #A\\n8. B#\\n9. #A\\n10. #A\\n11. #A\\n12. A#\\n13. A#\\n14. B#\\n15. B#\\n\\nNow, check token7 and8: #A and B#. What's the rule here? The rules require that the '#' faces each other. So the token7 is #A, and token8 is B#. They have A and B on the other ends, but since it's #A followed by B#, the '#' direction doesn't face each other. So no rule applies.\\n\\nMove to token8 and9: B# followed by #A. So that's B# #A, which matches rule3: B# #A becomes #A B#.\\n\\nSo tokens8 and9 are replaced by #A B#. So token9 becomes #A B#, but in the current sequence, there is no token after9 yet, except for #A in token7.\\n\\nLet me think:\\n\\nOriginal sequence after removing tokens5,6:\\n\\n7. #A\\n8. B#\\n9. #A\\n10. #A\\n11. #A\\n12. A#\\n13. A#\\n14. B#\\n15. B#\\n\\nAfter replacing tokens8 and9 (B# #A) with #A B#, the sequence becomes:\\n\\n7. #A\\n8. #A B# (now 2 tokens: #A and B#)\\n9. #A\\n10. #A\\n11. #A\\n12. A#\\n13. A#\\n14. B#\\n15. B#\\n\\nBut wait, each token is a single unit, right? So when we replace tokens8 and9 with #A B#, the sequence becomes:\\n\\nAfter replacement, the sequence is:\\n\\n7. #A\\n(new token after 7 is #A and then B#)\\n10. #A\\n11. #A\\n12. A#\\n13. A#\\n14. B#\\n15. B#\\n\\nWait, this seems a bit confusing. Let me correct this:\\n\\nWhen a rule is applied, two tokens are removed and replaced by the result, which can be one or two tokens.\\n\\nIn this case, rule3 is B# #A becomes #A B#. So it's two tokens being replaced by two tokens.\\n\\nSo after replacing token8 (B#) and token9 (#A) with #A and B#, the sequence becomes:\\n\\nToken7: #A\\n\\nToken new: #A (from replacement) and then B# (from replacement)\\n\\nThen token10: #A\\n\\nToken11: #A\\n\\nToken12: A#\\n\\nToken13:A#\\n\\nToken14:B#\\n\\nToken15:B#\\n\\nWait, perhaps I should model the sequence step by step.\\n\\nOriginal sequence after tokens5,6 are removed:\\n\\n7. #A\\n\\n8. B#\\n\\n9. #A\\n\\n10. #A\\n\\n11. #A\\n\\n12. A#\\n\\n13. A#\\n\\n14. B#\\n\\n15. B#\\n\\nSo indices 7 to15.\\n\\nWhen tokens8 and9 are applied (B# and #A), they become #A B#.\\n\\nSo the sequence is now:\\n\\nToken7: #A\\n\\nReplace tokens8-9 with two tokens: #A and B#\\n\\nThen continue with tokens10,11,12,...\\n\\nSo the new sequence is:\\n\\n7. #A\\n\\n8. #A\\n\\n9. B#\\n\\n10. #A\\n\\n11. #A\\n\\n12. A#\\n\\n13. A#\\n\\n14. B#\\n\\n15. B#\\n\\nSo now, there are 15 tokens, but token8,9 have added a B#, so let's double-check:\\n\\nWait, I think I'm getting confusing with the numbering.\\n\\nActually, when tokens8 and9 are replaced by two tokens: #A and B#, the entire sequence should be tokens7, new token8: #A, new token9: B#, then tokens10 to15 as before.\\n\\nWait, no: the replacement is for tokens8 and9; when you apply the rule, you remove tokens8 and9 and add the result. So in terms of the sequence, it's shortened.\\n\\nWait, perhaps a better approach is to replace tokens8 and9 with the result.\\n\\nLet me see:\\n\\nOriginal sequence:\\n\\npositions 7 to15:\\n\\n7. #A\\n\\n8. B#\\n\\n9. #A\\n\\n10. #A\\n\\n11. #A\\n\\n12. A#\\n\\n13. A#\\n\\n14. B#\\n\\n15. B#\\n\\nReplacing tokens8 and9 (B# and #A) with #A and B#:\\n\\nNow, the new sequence is:\\n\\npositions:\\n\\n7. #A\\n\\nnew token8: #A\\n\\nnew token9: B#\\n\\n10. #A\\n\\n11. #A\\n\\n12. A#\\n\\n13. A#\\n\\n14. B#\\n\\n15. B#\\n\\nSo that's 15 tokens, same as before, but now token9 is B#, different from previous.\\n\\nNow, check for new applicable pairs:\\n\\nStart from token7 and token8: #A followed by #A. No rule applies.\\n\\nToken8 (new #A) and token9 (B#): #A followed by B#. Since both have A and B, but the # directions are the same (both A in #A and B in B#), so it's actually opposite?\\n\\nWait, #A means A is on the right, but direction is the other way. Maybe I need to clarify.\\n\\nWait, actually, in terms of direction, the tokens are either like #A or A#. So when we have #A followed by B#, it's like 'A' facing left and 'B' facing right.\\n\\nThe rules apply when two neighboring tokens have '#' facing each other. So:\\n\\n- A# followed by #A (right facing A then left facing A)\\n- A# followed by #B (right facing A then left facing B)\\n- B# followed by #A (right facing B then left facing A)\\n- B# followed by #B (right facing B then left facing B)\\n\\nAh, so perhaps the key is that the '#' of one token meets the other token's face.\\n\\nSo when two tokens are adjacent, if the right face of the first is matching the left face of the second in terms of something.\\n\\nWait, maybe it's better to think that:\\n\\nEach token is either starting with # or not.\\n\\nFor example, A# has # on the left, A on the right.\\n\\nThe neighboring token to the right could be something that has either # or not on the left.\\n\\nSo A# followed by #A would mean that the right face of A# is A, and the left face of #A is #. So A and # don't match.\\n\\nWait, perhaps my earlier understanding was incorrect.\\n\\nWait, perhaps it's the other way: when a token has a # on its left or right, and the next token has a # on its left or right.\\n\\nFor example, A# has A on the right, # on the left.\\n\\nNext token: if it's #A, then it's # on left, A on right.\\n\\nSo when you have A# and #A, the right face of the first is A, and the left face of the second is #. Since they are different (A vs #), does that count as facing each other? Or is it faces are A and A?\\n\\nWait, I might need to re-examine the rules.\\n\\nThe rules are:\\n\\nA# #A becomes nothing\\n\\nA# #B becomes #B A#\\n\\nB# #A becomes #A B#\\n\\nB# #B becomes nothing\\n\\nSo the rules are when you have A# followed by #A, they are replaced by nothing.\\n\\nA# followed by #B: becomes #B A#.\\n\\nSimilarly, B# followed by #A becomes #A B#.\\n\\nAnd B# followed by #B becomes nothing.\\n\\nSo the key is that when two tokens have a face of A or B, and their # faces are facing each other.\\n\\nMeaning, for rule1: A# followed by #A - so the first token has A on the right, second has # on the left, which is facing the right. Hence, they are facing each other.\\n\\nSimilarly, for rule3: B# followed by #A.\\n\\nSo it's about the directions and the content.\\n\\nSo, in other words, for two adjacent tokens:\\n\\nIf the first token is A# and the second is #A, they cancel each other.\\n\\nIf first is A# and second is #B, they transform into #B A#.\\n\\nIf first is B# and second is #A, they transform into #A B#.\\n\\nIf first is B# and second is #B, they cancel.\\n\\nSo, regardless of the content (A or B), as long as the first token has something on its right and the second has # on its left, or vice versa.\\n\\nSo with that clarified, let's go back.\\n\\nAfter the replacement, we have:\\n\\nToken7: #A\\n\\nToken8: #A\\n\\nToken9: B#\\n\\nToken10: #A\\n\\nToken11: #A\\n\\nToken12: A#\\n\\nToken13: A#\\n\\nToken14: B#\\n\\nToken15: B#\\n\\nSo the sequence is:\\n\\n#A, #A, B#, #A, #A, A#, A#, B#, B#\\n\\nNow, let's check for new applicable pairs.\\n\\nStart with token1 and token2: both #A. Not applicable.\\n\\nToken2: #A, token3: B#. Is #A followed by B#, which is not a facing pair because the second is B on the right, not #.\\n\\nRule applicable only when two tokens are A# #A, A# #B, B# #A, B# #B.\\n\\nWait, token2 is #A, token3 is B#. But token3 is B#, not #something. So their pairing is #A followed by B#. This is different because the second token is not starting with #. So it might not form a rule. Wait, actually, let's see.\\n\\nWait, the rule A# #B means that first token is A# and the second is #B. So in our case, token3 is B#, which is the reverse.\\n\\nSo token2 is #A, token3 is B#. So that's #A followed by B#. Is that a rule? Let me see:\\n\\nLooking at the four rules:\\n\\n1. A# #A (A followed by A) -> nothing.\\n\\n2. A# #B (A followed by B) -> #B A#.\\n\\n3. B# #A (B followed by A) -> #A B#.\\n\\n4. B# #B (B followed by B) -> nothing.\\n\\nSo, we have #A followed by B#. The first is #A (starts with #), the second is B#. But for rule2, it's A# followed by #B. So different.\\n\\nSo #A followed by B# is B# followed by #A, which is the reverse of rule3.\\n\\nSince rule3 is B# followed by #A becomes #A B#. So the application of rule3 requires left token is B#, right token is #A.\\n\\nIn our case, token2 is #A, token3 is B#. So the left is #A, the right is B#, so not applicable for any rule.\\n\\nThus, no replacement happens between token2 and3.\\n\\nSimilarly, token3 is B#, token4 is #A. So B# followed by #A, which is rule3's condition. So yes! B# followed by #A: rule3.\\n\\nSo, applying rule3: B# followed by #A becomes #A B#.\\n\\nTherefore, tokens3 and4 are replaced by #A and B#.\\n\\nSo the sequence, after removal of tokens3 and4:\\n\\nThe sequence so far is:\\n\\nToken1: #A\\n\\nToken2: #A\\n\\nToken5: B#\\n\\nToken6: #A\\n\\nToken7: #A\\n\\nToken8: A#\\n\\nToken9: A#\\n\\nToken10: B#\\n\\nToken11: B#\\n\\nSo replacing token3 and4 (B#, #A) with #A and B#.\\n\\nSo the sequence becomes:\\n\\n#A, #A, #A, B#, #A, #A, A#, A#, B#, B#\\n\\nWait, perhaps a better way:\\n\\nThe original sequence before replacement:\\n\\n1. #A\\n\\n2. #A\\n\\n3. B#\\n\\n4. #A\\n\\n5. #A\\n\\n6. A#\\n\\n7. A#\\n\\n8. B#\\n\\n9. B#\\n\\nAfter replacing tokens3 and4 (B#, #A) with #A and B#, the sequence is:\\n\\n1. #A\\n\\n2. #A\\n\\n(new token3: #A\\n\\nnew token4: B#)\\n\\n5. #A\\n\\n6. A#\\n\\n7. A#\\n\\n8. B#\\n\\n9. B#\\n\\nSo the sequence becomes:\\n\\n#A, #A, #A, B#, #A, A#, A#, B#, B#\\n\\nWait, let's recount:\\n\\nAfter replacement, positions:\\n\\nOriginal: 1,2,3,4,5,6,7,8,9\\n\\nAfter removal of 3,4, replacing them with #A and B#, the sequence is:\\n\\n1,2, new3( #A ), new4 (B# ), 5,6,7,8,9\\n\\nSo:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n4: B#\\n\\n5: #A\\n\\n6: A#\\n\\n7: A#\\n\\n8: B#\\n\\n9: B#\\n\\nSo the new sequence is:\\n\\nPositions: 1,2,3,4,5,6,7,8,9 are:\\n\\n#A, #A, #A, B#, #A, A#, A#, B#, B#\\n\\nNow, we need to check for new rules between the new tokens.\\n\\nNow, the sequence is: #A, #A, #A, B#, #A, A#, A#, B#, B#\\n\\nLooking for pairs where a token is A# or B# followed by #A or #B.\\n\\nCheck token1 and2: #A followed by #A: same, not matching any rule.\\n\\nToken2 and3: #A followed by #A: Same.\\n\\nToken3 and4: #A followed by B#.\\n\\nThis is #A followed by B#. Is this a rule? Let's see.\\n\\nRule2 is A# followed by #B becomes #B A#. So, it's A# followed by #B.\\n\\nHere, we have #A followed by B#. That would be B# followed by #A, which is the reverse of rule3.\\n\\nSince rule3 is applicable when B# is followed by #A, which becomes #A B#. So yes, in this case, #A followed by B# is the reverse.\\n\\nDoes the transformation go both ways? The rules as stated are unidirectional:\\n\\n- A# #A becomes nothing.\\n\\n- A# #B becomes #B A#.\\n\\n- B# #A becomes #A B#.\\n\\n- B# #B becomes nothing.\\n\\nSo unless there's a reverse rule, as given, only the listed transformations happen.\\n\\nSo, token3 is #A, token4 is B#. So it's #A followed by B#. That's B# (in token4) is not preceded by anything; token3 is #A. It doesn't match any left-hand side of the rules. So no transformation happens.\\n\\nMoving on to token4 and5: B# followed by #A. That's exactly rule3: B# #A becomes #A B#. So tokens4 and5 are replaced by #A and B#.\\n\\nTherefore, tokens4 and5 are replaced, so the sequence becomes:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n(new token5: #A)\\n\\n(new token6: B#)\\n\\n7: A#\\n\\n8: A#\\n\\n9: B#\\n\\n10: B#\\n\\nSo the sequence is now:\\n\\nTokens:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n5: #A\\n\\n6: B#\\n\\n7: A#\\n\\n8: A#\\n\\n9: B#\\n\\n10: B#\\n\\nWait, no. After replacing tokens4 and5 (B#, #A) with #A and B#, the sequence is:\\n\\nOriginal tokens4 and5 removed, so tokens left after removal:\\n\\n1,2,3, new token4 and5, token6,7,8,9,10, etc.\\n\\nWait, it's getting a bit tangled. Perhaps a different approach:\\n\\nOriginal after replacing tokens3,4:\\n\\n#A, #A, #A, B#, #A, A#, A#, B#, B#\\n\\nSo positions 1-9:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n4: B#\\n\\n5: #A\\n\\n6: A#\\n\\n7: A#\\n\\n8: B#\\n\\n9: B#\\n\\nNow, checking for applicable pairs. Next pair after tokens3 is token4 and5: B# and #A.\\n\\nB# followed by #A is exactly rule3: replaced by #A and B#.\\n\\nTherefore, removing tokens4 and5 (B# and #A), and adding #A and B#.\\n\\nSo new sequence:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n#A (from replacement at token5)\\n\\nB# (from replacement at token6)\\n\\n7: A#\\n\\n8: A#\\n\\n9: B#\\n\\n10: B#\\n\\nSo the sequence now is:\\n\\npositions:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n5: #A\\n\\n6: B#\\n\\n7: A#\\n\\n8: A#\\n\\n9: B#\\n\\n10: B#\\n\\nWait, perhaps we're duplicating indices. Maybe better to think that after replacing tokens4 and5:\\n\\nOriginal tokens:\\n\\n1,2,3: #A, #A, #A\\n\\n4,5: B#, #A\\n\\n6,7,8,9: A#, A#, B#, B#\\n\\nAfter replacement:\\n\\nRemove tokens4 and5, replace with #A and B#.\\n\\nSo the new sequence is:\\n\\ntokens1,2,3, then #A, B#, then 6,7,8,9:\\n\\nSo:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n5: #A\\n\\n6: B#\\n\\n7: A#\\n\\n8: A#\\n\\n9: B#\\n\\n10: B#\\n\\nSo now, the sequence is length 10.\\n\\nCheck for next transformations:\\n\\nToken5 is #A, token6 is B#. So #A followed by B#, which isn't a rule.\\n\\nToken6 is B#, token7 is A#: B# followed by A#, which isn't a rule.\\n\\nToken7 is A#, token8 is A#: not a rule.\\n\\nToken8 is A#, token9 is B#: not a rule.\\n\\nToken9 and10: B# followed by B#: rule4, they become nothing.\\n\\nSo tokens9 and10 are B# B#, which becomes nothing.\\n\\nSo remove tokens9 and10:\\n\\nSequence becomes:\\n\\n1: #A\\n\\n2: #A\\n\\n3: #A\\n\\n5: #A\\n\\n6: B#\\n\\n7: A#\\n\\n8: A#\\n\\nSo now sequence is:\\n\\n#A, #A, #A, #A, B#, A#, A#\\n\\nCheck for applicable rules.\\n\\nCheck token1 and2: #A followed by #A: same direction, no rule.\\n\\nToken2 and3: same.\\n\\nToken3 and4: same.\\n\\nToken4 and5: #A followed by B#. Not a rule.\\n\\nToken5 is B#, token6 is A#: B# followed by A#, which is not rule3 because rule3 is B# followed by #A.\\n\\nSimilarly, token6 is A#, token7 is A#: same, no rule.\\n\\nTherefore, no more replacements possible.\\n\\nSo after all these operations, the final state of the program is:\\n\\n#A, #A, #A, #A, B#, A#, A#\\n\\nSo translating each token:\\n\\n#A is A#, right? Wait, let's remember that the tokens are:\\n\\n- A# is A followed by #\\n- #A is # followed by A\\n\\nSo in the final sequence:\\n\\n1. #A: becomes A#\\n2. #A: becomes A#\\n3. #A: becomes A#\\n4. #A: becomes A#\\n5. B#: becomes B#\\n6. A#: becomes A#\\n7. A#: becomes A#\\n\\nSo the final sequence is:\\n\\n1. A#\\n2. A#\\n3. A#\\n4. A#\\n5. B#\\n6. A#\\n7. A#\\n\\nThus, when we write it out:\\n\\nA# A# A# A# B# A# A#\\n\\nSo that's seven tokens.\\n\\nIs that the final state? Let me make sure.\\n\\nWait, is there any chance of further transformations?\\n\\nLooking at the final sequence: A#, A#, A#, A#, B#, A#, A#\\n\\nCheck for any adjacent pairs:\\n\\nEach A# followed by A# or B# or A#.\\n\\nSo:\\n\\n1-2: A# A# - no rule.\\n\\n2-3: same.\\n\\n3-4: same.\\n\\n4-5: A# followed by B#. That is, token4 is A#, token5 is B#. So is there a rule? Looking back:\\n\\nRule2: A# followed by #B becomes #B A#.\\n\\nHere, A# followed by B# is not the required. The rule expects A# followed by #B, with the # in the middle.\\n\\nHere, it's A# followed by B#, which is different.\\n\\nSimilarly, token5 is B#, token6 is A#: which is B# followed by A#, but the rule is B# followed by #A.\\n\\nSo, not applicable.\\n\\nTherefore, no more rules can apply.\\n\\nSo the final program is A# A# A# A# B# A# A#.\\n\\nBut let me make sure I didn't miss any possible pairs that I could have transformed.\\n\\nWait, is there a way A# followed by B# can be transformed? Or is that not covered by any rule?\\n\\nLooking at the rules:\\n\\nA# followed by #A becomes nothing: so A# and then empty A#.\\n\\nA# followed by #B becomes #B A#: meaning turning A# #B into #B A#.\\n\\nB# followed by #A becomes #A B#.\\n\\nB# followed by #B becomes nothing.\\n\\nTherefore, A# followed by B# is just A# B#, no replacement.\\n\\nSimilarly, B# followed by A# is B# A#.\\n\\nSo in the final sequence, these are not covered.\\n\\nSo yes, the program stops here.\\n\\nTherefore, the final state is A# A# A# A# B# A# A#\\n\\nWhich in terms of the tokens:\\n\\nA#, A#, A#, A#, B#, A#, A#\\n\\nSo writing this as a string separated by spaces:\\n\\nA# A# A# A# B# A# A#\\n\\nWhich in the original token notation is:\\n\\nA# A# A# A# B# A# A#\\n\\nThus, the final program is composed of these tokens.\\n\\nTherefore, the answer is the sequence above. But let me write it in the box as per instruction.\\n\\n**Final Answer**\\n\\nThe final state of the program is \\\\boxed{A\\\\# A\\\\# A\\\\# A\\\\# B\\\\# A\\\\# A\\\\#}.\\n</think>\\n\\nThe given A::B program is a sequence of tokens: `A# A# A# #A #A B# B# A# #A B# B# #A B# #A #A #A A# A# B# B#`. We need to apply the rewrite rules to compute the final state of the program.\\n\\nThe rewrite rules are:\\n1. \\\\( A\\\\# \\\\#A \\\\) becomes nothing.\\n2. \\\\( A\\\\# \\\\#B \\\\) becomes \\\\( \\\\#B A\\\\# \\\\).\\n3. \\\\( B\\\\# \\\\#A \\\\) becomes \\\\( \\\\#A B\\\\# \\\\).\\n4. \\\\( B\\\\# \\\\#B \\\\) becomes nothing.\\n\\nWe start by scanning the sequence from left to right and applying the rules whenever two neighboring tokens match the required patterns.\\n\\n1. Removing \\\\( A\\\\# \\\\#A \\\\) pairs and simplifying the sequence.\\n2. Repeating the process until no more applicable rules can be applied.\\n\\nAfter applying the rules step by step, the final state of the program is determined to be:\\n\\n\\\\[ A\\\\# A\\\\# A\\\\# A\\\\# B\\\\# A\\\\# A\\\\# \\\\]\\n\\nThus, the final state of the program is:\\n\\n\\\\[\\n\\\\boxed{A\\\\# A\\\\# A\\\\# A\\\\# B\\\\# A\\\\# A\\\\#}\\n\\\\]\",\n",
       " 'ground_truth': '#A #A #A B# B# B# B# B# A# A# B# B#',\n",
       " 'score': {'score': 0.0, 'soft_format': 0.25, 'hard_format': 0.0},\n",
       " 'difficulty': None,\n",
       " 'reason': \"score: {'score': 0.0, 'soft_format': 0.25, 'hard_format': 0.0}\",\n",
       " 'final_reward': 0,\n",
       " 'uuid': '3c82337a-7f27-4270-872a-da3a8c5ef633'}"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "86a5d38d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import reasoning_gym\n",
    "from reasoning_gym.factory import create_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "d78d9482",
   "metadata": {},
   "outputs": [],
   "source": [
    "puzzles = {\n",
    "    \"ab\": {\"length\": 20},\n",
    "    \"advanced_geometry\": {\"min_coord\": -20, \"max_coord\": 20},\n",
    "    \"knight_swap\": {},\n",
    "    \"tsumego\": {\"min_board_size\": 15, \"max_board_size\": 19, \"max_stones\": 20},\n",
    "    \"fraction_simplification\": {},\n",
    "    \"propositional_logic\": {},\n",
    "    \"bitwise_arithmetic\": {\"difficulty\": 4},\n",
    "    \"letter_counting\": {\"min_words\": 25, \"max_words\": 35},\n",
    "    \"maze\": {\"min_dist\": 15, \"max_dist\": 25, \"min_grid_size\": 15, \"max_grid_size\": 25},\n",
    "    \"puzzle24\": {\"min_value\": 8, \"max_value\": 10},\n",
    "    \"quantum_lock\": {\"difficulty\": 25},\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "53007cb8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2749/2749 [00:27<00:00, 99.42it/s] \n"
     ]
    }
   ],
   "source": [
    "correctness_response_length_list = {}\n",
    "from tqdm import tqdm\n",
    "for d in tqdm(data):\n",
    "    generated_answer = extract_boxed_answer(d['response'])\n",
    "    for i in range(len(reference_dataset)):\n",
    "        if reference_dataset[i]['uuid'] == d['uuid']:\n",
    "            gold_entry = reference_dataset[i]\n",
    "            puzzle = gold_entry['puzzle']\n",
    "            metadata = gold_entry['metadata']\n",
    "            dataset = reasoning_gym.create_dataset(puzzle, size=50, **puzzles[puzzle])\n",
    "            break\n",
    "    if gold_entry is not None:\n",
    "        score = dataset.score_answer(answer=generated_answer, entry=gold_entry)\n",
    "        if score < 1:\n",
    "            score = 0\n",
    "    if d['data_source'] not in correctness_response_length_list:\n",
    "        correctness_response_length_list[d['data_source']] = []\n",
    "    correctness_response_length_list[d['data_source']].append((score, len(tokenizer.encode(d['response']))))\n",
    "\n",
    "\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "450d4548",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_aucoaa(data):\n",
    "    # data is a list of (correctness, response_length) tuples\n",
    "    n = len(data)\n",
    "    tmax = max(length for _, length in data)\n",
    "    oaa_t_list = []\n",
    "    for t in range(tmax + 1):\n",
    "        filtered = [correct for correct, length in data if length < t]\n",
    "        if filtered:\n",
    "            oaa_t = sum(filtered) / n  # Denominator is always n, per definition\n",
    "        else:\n",
    "            oaa_t = 0.0\n",
    "        oaa_t_list.append(oaa_t)\n",
    "    aucoaa = sum(oaa_t_list) / len(oaa_t_list)\n",
    "    return aucoaa\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "5c77470a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data Source:  underthink-bench\n",
      "Mean Response Length:  5918.535103674063\n",
      "Accuracy:  0.22299017824663514\n",
      "Number of Samples:  2749\n",
      "AUC-OAA:  0.1497214831600845\n",
      "--------------------------------\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "for key in correctness_response_length_list.keys():\n",
    "    mean_response_length = np.mean([length for _, length in correctness_response_length_list[key]])\n",
    "    accuracy = np.mean([correct for correct, _ in correctness_response_length_list[key]])\n",
    "    print(\"Data Source: \", key)\n",
    "    print(\"Mean Response Length: \", mean_response_length)\n",
    "    print(\"Accuracy: \", accuracy)\n",
    "    print(\"Number of Samples: \", len(correctness_response_length_list[key]))\n",
    "    print(\"AUC-OAA: \", compute_aucoaa(correctness_response_length_list[key]))\n",
    "    print(\"--------------------------------\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "475f361d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
