{
  "timestamp": "2025-09-14T01:04:50.837550",
  "benchmark_size": 211,
  "prompt_type": "direct_answer",
  "models_evaluated": [
    "huggingface/Qwen/Qwen3-0.6B",
    "huggingface/Qwen/Qwen3-4B",
    "huggingface/Qwen/Qwen3-8B"
  ],
  "results_summary": {
    "huggingface/Qwen/Qwen3-0.6B": {
      "accuracy": 0.014218009478672985,
      "avg_response_time": 0.44385605852750804,
      "results_by_operation": {
        "complex": {
          "total": 1,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 2.8797731399536133
        },
        "trigonometry": {
          "total": 10,
          "correct": 2,
          "accuracy": 0.2,
          "avg_response_time": 0.21155338287353515
        },
        "subtraction": {
          "total": 40,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.622601968050003
        },
        "division": {
          "total": 25,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.33790852546691896
        },
        "logarithm": {
          "total": 25,
          "correct": 1,
          "accuracy": 0.04,
          "avg_response_time": 0.2916263675689697
        },
        "exponentiation": {
          "total": 25,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.23500329971313477
        },
        "multiplication": {
          "total": 25,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.328193826675415
        },
        "addition": {
          "total": 60,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.5655990362167358
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.2232819652557373
        },
        "hard": {
          "total": 86,
          "correct": 3,
          "accuracy": 0.03488372093023256,
          "avg_response_time": 0.5558536718058031
        },
        "medium": {
          "total": 100,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.40268163442611693
        }
      },
      "metadata": {
        "prompt_type": "direct_answer",
        "prompt_description": "Direct numerical answer only"
      }
    },
    "huggingface/Qwen/Qwen3-4B": {
      "accuracy": 0.8720379146919431,
      "avg_response_time": 0.1954419669381815,
      "results_by_operation": {
        "complex": {
          "total": 1,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 0.22928667068481445
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 0.07816793918609619
        },
        "subtraction": {
          "total": 40,
          "correct": 29,
          "accuracy": 0.725,
          "avg_response_time": 0.3203514516353607
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.10205718994140625
        },
        "logarithm": {
          "total": 25,
          "correct": 22,
          "accuracy": 0.88,
          "avg_response_time": 0.09936222076416015
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.13797806739807128
        },
        "multiplication": {
          "total": 25,
          "correct": 21,
          "accuracy": 0.84,
          "avg_response_time": 0.1721661853790283
        },
        "addition": {
          "total": 60,
          "correct": 52,
          "accuracy": 0.8666666666666667,
          "avg_response_time": 0.24373565514882406
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 24,
          "accuracy": 0.96,
          "avg_response_time": 0.09258334159851074
        },
        "hard": {
          "total": 86,
          "correct": 67,
          "accuracy": 0.7790697674418605,
          "avg_response_time": 0.25798707784608355
        },
        "medium": {
          "total": 100,
          "correct": 93,
          "accuracy": 0.93,
          "avg_response_time": 0.16736782789230348
        }
      },
      "metadata": {
        "prompt_type": "direct_answer",
        "prompt_description": "Direct numerical answer only"
      }
    },
    "huggingface/Qwen/Qwen3-8B": {
      "accuracy": 0.957345971563981,
      "avg_response_time": 0.17883439651597732,
      "results_by_operation": {
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 0.09308290481567383
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 0.08193988800048828
        },
        "subtraction": {
          "total": 40,
          "correct": 37,
          "accuracy": 0.925,
          "avg_response_time": 0.22352556586265565
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.10483613014221191
        },
        "logarithm": {
          "total": 25,
          "correct": 24,
          "accuracy": 0.96,
          "avg_response_time": 0.09825555801391601
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.1427753448486328
        },
        "multiplication": {
          "total": 25,
          "correct": 24,
          "accuracy": 0.96,
          "avg_response_time": 0.16860090255737303
        },
        "addition": {
          "total": 60,
          "correct": 56,
          "accuracy": 0.9333333333333333,
          "avg_response_time": 0.2503142476081848
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.08565625190734863
        },
        "hard": {
          "total": 86,
          "correct": 78,
          "accuracy": 0.9069767441860465,
          "avg_response_time": 0.22874671082163966
        },
        "medium": {
          "total": 100,
          "correct": 99,
          "accuracy": 0.99,
          "avg_response_time": 0.1592043423652649
        }
      },
      "metadata": {
        "prompt_type": "direct_answer",
        "prompt_description": "Direct numerical answer only"
      }
    }
  }
}