{
  "timestamp": "2025-09-14T01:33:21.611868",
  "benchmark_size": 211,
  "prompt_type": "direct_answer",
  "models_evaluated": [
    "openai/gpt-4o-mini",
    "anthropic/claude-3-5-haiku-20241022",
    "together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput",
    "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
  ],
  "results_summary": {
    "openai/gpt-4o-mini": {
      "accuracy": 0.909952606635071,
      "avg_response_time": 0.4057103785293362,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 37,
          "accuracy": 0.925,
          "avg_response_time": 0.396700394153595
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 0.3530494928359985
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 0.5091910362243652
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.37620265007019044
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.43011383056640623
        },
        "multiplication": {
          "total": 25,
          "correct": 24,
          "accuracy": 0.96,
          "avg_response_time": 0.43422277450561525
        },
        "addition": {
          "total": 60,
          "correct": 49,
          "accuracy": 0.8166666666666667,
          "avg_response_time": 0.40622021357218424
        },
        "logarithm": {
          "total": 25,
          "correct": 21,
          "accuracy": 0.84,
          "avg_response_time": 0.4124197578430176
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.37621575355529785
        },
        "hard": {
          "total": 86,
          "correct": 68,
          "accuracy": 0.7906976744186046,
          "avg_response_time": 0.3949548006057739
        },
        "medium": {
          "total": 100,
          "correct": 99,
          "accuracy": 0.99,
          "avg_response_time": 0.42233383178710937
        }
      },
      "metadata": {
        "prompt_type": "direct_answer",
        "prompt_description": "Direct numerical answer only"
      }
    },
    "anthropic/claude-3-5-haiku-20241022": {
      "accuracy": 0.990521327014218,
      "avg_response_time": 0.604168433148714,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 40,
          "accuracy": 1.0,
          "avg_response_time": 0.5550179123878479
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 0.5996559858322144
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 0.6133341789245605
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.5387032318115235
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.5783308982849121
        },
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.5647346591949463
        },
        "addition": {
          "total": 60,
          "correct": 58,
          "accuracy": 0.9666666666666667,
          "avg_response_time": 0.631926667690277
        },
        "logarithm": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.7483643627166748
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.5605968856811523
        },
        "hard": {
          "total": 86,
          "correct": 85,
          "accuracy": 0.9883720930232558,
          "avg_response_time": 0.6528528901033623
        },
        "medium": {
          "total": 100,
          "correct": 99,
          "accuracy": 0.99,
          "avg_response_time": 0.5731926870346069
        }
      },
      "metadata": {
        "prompt_type": "direct_answer",
        "prompt_description": "Direct numerical answer only"
      }
    },
    "together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput": {
      "accuracy": 0.995260663507109,
      "avg_response_time": 0.3003173021343647,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 40,
          "accuracy": 1.0,
          "avg_response_time": 0.2565545797348022
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 0.15566110610961914
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 0.3343799114227295
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.23257992744445802
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.3229698085784912
        },
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.23509495735168456
        },
        "addition": {
          "total": 60,
          "correct": 59,
          "accuracy": 0.9833333333333333,
          "avg_response_time": 0.45812358856201174
        },
        "logarithm": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.15840975761413575
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.5238600826263428
        },
        "hard": {
          "total": 86,
          "correct": 85,
          "accuracy": 0.9883720930232558,
          "avg_response_time": 0.29021699206773627
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 0.25311787366867067
        }
      },
      "metadata": {
        "prompt_type": "direct_answer",
        "prompt_description": "Direct numerical answer only"
      }
    },
    "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": {
      "accuracy": 0.990521327014218,
      "avg_response_time": 0.13956675597276733,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 39,
          "accuracy": 0.975,
          "avg_response_time": 0.14488280415534974
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 0.11342787742614746
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 0.1699690818786621
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.12230437278747558
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.13455548286437988
        },
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.12445989608764649
        },
        "addition": {
          "total": 60,
          "correct": 59,
          "accuracy": 0.9833333333333333,
          "avg_response_time": 0.15878893931706747
        },
        "logarithm": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.13154781341552735
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 0.12492811203002929
        },
        "hard": {
          "total": 86,
          "correct": 84,
          "accuracy": 0.9767441860465116,
          "avg_response_time": 0.1392952896827875
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 0.14345987796783446
        }
      },
      "metadata": {
        "prompt_type": "direct_answer",
        "prompt_description": "Direct numerical answer only"
      }
    }
  }
}