{
  "timestamp": "2025-09-14T01:56:39.290797",
  "benchmark_size": 211,
  "prompt_type": "step_by_step_boxed",
  "models_evaluated": [
    "huggingface/Qwen/Qwen3-0.6B",
    "huggingface/Qwen/Qwen3-4B",
    "huggingface/Qwen/Qwen3-8B"
  ],
  "results_summary": {
    "huggingface/Qwen/Qwen3-0.6B": {
      "accuracy": 0.8578199052132701,
      "avg_response_time": 2.4135433305496288,
      "results_by_operation": {
        "multiplication": {
          "total": 25,
          "correct": 24,
          "accuracy": 0.96,
          "avg_response_time": 3.297241973876953
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.288179292678833
        },
        "logarithm": {
          "total": 25,
          "correct": 23,
          "accuracy": 0.92,
          "avg_response_time": 2.5758793544769287
        },
        "subtraction": {
          "total": 40,
          "correct": 31,
          "accuracy": 0.775,
          "avg_response_time": 2.572245740890503
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.733338212966919
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 4.80588698387146
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 1.6647454977035523
        },
        "addition": {
          "total": 60,
          "correct": 42,
          "accuracy": 0.7,
          "avg_response_time": 2.2924750049908957
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.4475726413726806
        },
        "hard": {
          "total": 86,
          "correct": 66,
          "accuracy": 0.7674418604651163,
          "avg_response_time": 2.6843185203019964
        },
        "medium": {
          "total": 100,
          "correct": 90,
          "accuracy": 0.9,
          "avg_response_time": 2.42216933965683
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    },
    "huggingface/Qwen/Qwen3-4B": {
      "accuracy": 0.9620853080568721,
      "avg_response_time": 5.946729312010851,
      "results_by_operation": {
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 5.988243675231933
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 4.779955806732178
        },
        "logarithm": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 6.08839672088623
        },
        "subtraction": {
          "total": 40,
          "correct": 38,
          "accuracy": 0.95,
          "avg_response_time": 6.02228684425354
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 4.6386988067626955
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 7.3401172161102295
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 6.609384536743164
        },
        "addition": {
          "total": 60,
          "correct": 54,
          "accuracy": 0.9,
          "avg_response_time": 6.717534554004669
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.3574006080627443
        },
        "hard": {
          "total": 86,
          "correct": 78,
          "accuracy": 0.9069767441860465,
          "avg_response_time": 7.925383723059366
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 5.142418694496155
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    },
    "huggingface/Qwen/Qwen3-8B": {
      "accuracy": 0.966824644549763,
      "avg_response_time": 7.163264580812499,
      "results_by_operation": {
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 6.798269786834717
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 5.107650232315064
        },
        "logarithm": {
          "total": 25,
          "correct": 24,
          "accuracy": 0.96,
          "avg_response_time": 6.585007305145264
        },
        "subtraction": {
          "total": 40,
          "correct": 38,
          "accuracy": 0.95,
          "avg_response_time": 8.440913480520248
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 6.490916795730591
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 7.334251642227173
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 7.069442296028138
        },
        "addition": {
          "total": 60,
          "correct": 56,
          "accuracy": 0.9333333333333333,
          "avg_response_time": 7.8539584954579675
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.7977223014831543
        },
        "hard": {
          "total": 86,
          "correct": 79,
          "accuracy": 0.9186046511627907,
          "avg_response_time": 9.444906528605971
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 6.29243807554245
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    }
  }
}