{
  "timestamp": "2025-09-14T02:26:53.354090",
  "benchmark_size": 211,
  "prompt_type": "step_by_step_boxed",
  "models_evaluated": [
    "openai/gpt-4o-mini",
    "anthropic/claude-3-5-haiku-20241022",
    "together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput",
    "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
  ],
  "results_summary": {
    "openai/gpt-4o-mini": {
      "accuracy": 0.9241706161137441,
      "avg_response_time": 3.499606694090423,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 38,
          "accuracy": 0.95,
          "avg_response_time": 3.679339575767517
        },
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 3.5669068336486816
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 3.4703338146209717
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 4.482011489868164
        },
        "addition": {
          "total": 60,
          "correct": 49,
          "accuracy": 0.8166666666666667,
          "avg_response_time": 3.6166720469792684
        },
        "logarithm": {
          "total": 25,
          "correct": 22,
          "accuracy": 0.88,
          "avg_response_time": 3.0774894523620606
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 3.024138402938843
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.4948477745056152
        }
      },
      "results_by_difficulty": {
        "hard": {
          "total": 86,
          "correct": 70,
          "accuracy": 0.813953488372093,
          "avg_response_time": 3.7727210937544355
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 3.8043894815444945
        },
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.3409620094299317
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    },
    "anthropic/claude-3-5-haiku-20241022": {
      "accuracy": 0.995260663507109,
      "avg_response_time": 2.551776169600645,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 40,
          "accuracy": 1.0,
          "avg_response_time": 2.5546135127544405
        },
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.8580487155914307
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 1.9238872528076172
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.9844112491607664
        },
        "addition": {
          "total": 60,
          "correct": 59,
          "accuracy": 0.9833333333333333,
          "avg_response_time": 2.6817786931991576
        },
        "logarithm": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.4818278694152833
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 2.4419636487960816
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.6353116035461426
        }
      },
      "results_by_difficulty": {
        "hard": {
          "total": 86,
          "correct": 85,
          "accuracy": 0.9883720930232558,
          "avg_response_time": 2.622892083123673
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 2.769221103191376
        },
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.4373576927185059
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    },
    "together/Qwen/Qwen3-235B-A22B-Instruct-2507-tput": {
      "accuracy": 0.995260663507109,
      "avg_response_time": 8.61973781156314,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 40,
          "accuracy": 1.0,
          "avg_response_time": 9.62946674823761
        },
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 4.978518724441528
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 4.520082950592041
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 5.565825452804566
        },
        "addition": {
          "total": 60,
          "correct": 59,
          "accuracy": 0.9833333333333333,
          "avg_response_time": 13.438167989253998
        },
        "logarithm": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 9.24491810798645
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 4.718000411987305
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 3.2345713901519777
        }
      },
      "results_by_difficulty": {
        "hard": {
          "total": 86,
          "correct": 85,
          "accuracy": 0.9883720930232558,
          "avg_response_time": 14.361953444259111
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 5.226574602127076
        },
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.439168872833252
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    },
    "together/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8": {
      "accuracy": 1.0,
      "avg_response_time": 2.0175266491858315,
      "results_by_operation": {
        "subtraction": {
          "total": 40,
          "correct": 40,
          "accuracy": 1.0,
          "avg_response_time": 2.164085400104523
        },
        "multiplication": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.087602777481079
        },
        "complex": {
          "total": 1,
          "correct": 1,
          "accuracy": 1.0,
          "avg_response_time": 4.962007999420166
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.1959205436706544
        },
        "addition": {
          "total": 60,
          "correct": 60,
          "accuracy": 1.0,
          "avg_response_time": 2.112051486968994
        },
        "logarithm": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 2.262137508392334
        },
        "trigonometry": {
          "total": 10,
          "correct": 10,
          "accuracy": 1.0,
          "avg_response_time": 2.3785991430282594
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.8008839035034179
        }
      },
      "results_by_difficulty": {
        "hard": {
          "total": 86,
          "correct": 86,
          "accuracy": 1.0,
          "avg_response_time": 2.2427962020386096
        },
        "medium": {
          "total": 100,
          "correct": 100,
          "accuracy": 1.0,
          "avg_response_time": 1.9165190982818603
        },
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 1.6466295909881592
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    }
  }
}