{
  "timestamp": "2025-09-15T03:51:30.916907",
  "benchmark_size": 211,
  "prompt_type": "step_by_step_boxed",
  "models_evaluated": [
    "huggingface/results/phase2/models/stage6_hard/merged_model"
  ],
  "results_summary": {
    "huggingface/results/phase2/models/stage6_hard/merged_model": {
      "accuracy": 0.7819905213270142,
      "avg_response_time": 104.68490855953705,
      "results_by_operation": {
        "addition": {
          "total": 60,
          "correct": 40,
          "accuracy": 0.6666666666666666,
          "avg_response_time": 105.51908413966497
        },
        "multiplication": {
          "total": 25,
          "correct": 18,
          "accuracy": 0.72,
          "avg_response_time": 103.95674317359925
        },
        "logarithm": {
          "total": 25,
          "correct": 22,
          "accuracy": 0.88,
          "avg_response_time": 104.02083923339843
        },
        "subtraction": {
          "total": 40,
          "correct": 27,
          "accuracy": 0.675,
          "avg_response_time": 105.19460912942887
        },
        "complex": {
          "total": 1,
          "correct": 0,
          "accuracy": 0.0,
          "avg_response_time": 108.1399884223938
        },
        "trigonometry": {
          "total": 10,
          "correct": 8,
          "accuracy": 0.8,
          "avg_response_time": 104.07256047725677
        },
        "division": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 103.92382179260254
        },
        "exponentiation": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 104.1274237728119
        }
      },
      "results_by_difficulty": {
        "easy": {
          "total": 25,
          "correct": 25,
          "accuracy": 1.0,
          "avg_response_time": 109.46750720024109
        },
        "medium": {
          "total": 100,
          "correct": 84,
          "accuracy": 0.84,
          "avg_response_time": 104.07004663228989
        },
        "hard": {
          "total": 86,
          "correct": 56,
          "accuracy": 0.6511627906976745,
          "avg_response_time": 104.00957398636396
        }
      },
      "metadata": {
        "prompt_type": "step_by_step_boxed",
        "prompt_description": "Step-by-step reasoning with boxed final answer"
      }
    }
  }
}