{
  "accuracy": {
    "total_questions": 247,
    "evaluated_questions": 59,
    "correct_answers": 17,
    "fail_answers": 0,
    "unknown_answers": 3,
    "missing_predictions": [],
    "accuracy": 0.288135593220339,
    "detailed_results": [
      {
        "question_index": "question190",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question191",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question192",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question193",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question194",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question195",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question196",
        "ground_truth": "C",
        "predicted": "C",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question197",
        "ground_truth": "C",
        "predicted": "D",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question198",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question199",
        "ground_truth": "D",
        "predicted": "D",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question200",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question201",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question202",
        "ground_truth": "B",
        "predicted": "B",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question203",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question204",
        "ground_truth": "B",
        "predicted": "B",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question205",
        "ground_truth": "A",
        "predicted": "C",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question206",
        "ground_truth": "D",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question207",
        "ground_truth": "C",
        "predicted": "A",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question208",
        "ground_truth": "D",
        "predicted": "D",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question209",
        "ground_truth": "B",
        "predicted": "B",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question210",
        "ground_truth": "B",
        "predicted": "D",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question211",
        "ground_truth": "B",
        "predicted": "C",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question212",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question213",
        "ground_truth": "A",
        "predicted": "D",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question214",
        "ground_truth": "A",
        "predicted": "D",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question215",
        "ground_truth": "A",
        "predicted": "D",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question216",
        "ground_truth": "B",
        "predicted": "B",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question217",
        "ground_truth": "B",
        "predicted": "UNKNOWN",
        "correct": false,
        "status": "unknown"
      },
      {
        "question_index": "question218",
        "ground_truth": "A",
        "predicted": "UNKNOWN",
        "correct": false,
        "status": "unknown"
      },
      {
        "question_index": "question219",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question220",
        "ground_truth": "D",
        "predicted": "D",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question221",
        "ground_truth": "B",
        "predicted": "C",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question222",
        "ground_truth": "A",
        "predicted": "C",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question223",
        "ground_truth": "C",
        "predicted": "C",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question224",
        "ground_truth": "A",
        "predicted": "C",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question225",
        "ground_truth": "B",
        "predicted": "C",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question226",
        "ground_truth": "C",
        "predicted": "C",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question227",
        "ground_truth": "B",
        "predicted": "A",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question228",
        "ground_truth": "A",
        "predicted": "A",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question229",
        "ground_truth": "A",
        "predicted": "A",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question230",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question231",
        "ground_truth": "A",
        "predicted": "A",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question232",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question233",
        "ground_truth": "B",
        "predicted": "B",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question234",
        "ground_truth": "D",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question235",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question236",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question237",
        "ground_truth": "B",
        "predicted": "B",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question238",
        "ground_truth": "B",
        "predicted": "D",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question239",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question240",
        "ground_truth": "C",
        "predicted": "A",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question241",
        "ground_truth": "B",
        "predicted": "A",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question242",
        "ground_truth": "C",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question243",
        "ground_truth": "C",
        "predicted": "C",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question244",
        "ground_truth": "A",
        "predicted": "C",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question245",
        "ground_truth": "B",
        "predicted": "B",
        "correct": true,
        "status": "correct"
      },
      {
        "question_index": "question246",
        "ground_truth": "B",
        "predicted": "UNKNOWN",
        "correct": false,
        "status": "unknown"
      },
      {
        "question_index": "question247",
        "ground_truth": "A",
        "predicted": "B",
        "correct": false,
        "status": "incorrect"
      },
      {
        "question_index": "question248",
        "ground_truth": "C",
        "predicted": "D",
        "correct": false,
        "status": "incorrect"
      }
    ]
  },
  "efficiency": {
    "total_questions": 247,
    "evaluated_questions": 59,
    "efficiency_scores": [
      0.8571428571428571,
      0.5,
      0.5,
      0.0,
      0.0,
      0.0,
      0.9090909090909091,
      0.5,
      0.6,
      0.7692307692307693,
      0.6666666666666666,
      1.25,
      0.8181818181818182,
      0.2,
      0.5,
      1.0,
      0.0,
      2.75,
      1.0,
      1.0,
      0.6666666666666666,
      1.0,
      0.4,
      5.0,
      2.3,
      0.0,
      0.3333333333333333,
      8.0,
      8.0,
      0.6666666666666666,
      2.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.0,
      0.3333333333333333,
      7.0,
      0.6666666666666666,
      1.0,
      0.5,
      1.0,
      1.0,
      0.2,
      0.4,
      1.0,
      4.6,
      1.0,
      1.0,
      4.8,
      1.4,
      0.3333333333333333,
      0.3333333333333333,
      3.6666666666666665,
      3.0,
      8.0,
      2.3333333333333335,
      0.0
    ],
    "average_efficiency": 1.45345163311265,
    "detailed_results": [
      {
        "question_index": "question190",
        "gt_tool_count": 7,
        "model_tool_count": 6,
        "efficiency": 0.8571428571428571,
        "status": "evaluated"
      },
      {
        "question_index": "question191",
        "gt_tool_count": 12,
        "model_tool_count": 6,
        "efficiency": 0.5,
        "status": "evaluated"
      },
      {
        "question_index": "question192",
        "gt_tool_count": 12,
        "model_tool_count": 6,
        "efficiency": 0.5,
        "status": "evaluated"
      },
      {
        "question_index": "question193",
        "gt_tool_count": 12,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question194",
        "gt_tool_count": 10,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question195",
        "gt_tool_count": 13,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question196",
        "gt_tool_count": 11,
        "model_tool_count": 10,
        "efficiency": 0.9090909090909091,
        "status": "evaluated"
      },
      {
        "question_index": "question197",
        "gt_tool_count": 10,
        "model_tool_count": 5,
        "efficiency": 0.5,
        "status": "evaluated"
      },
      {
        "question_index": "question198",
        "gt_tool_count": 10,
        "model_tool_count": 6,
        "efficiency": 0.6,
        "status": "evaluated"
      },
      {
        "question_index": "question199",
        "gt_tool_count": 13,
        "model_tool_count": 10,
        "efficiency": 0.7692307692307693,
        "status": "evaluated"
      },
      {
        "question_index": "question200",
        "gt_tool_count": 12,
        "model_tool_count": 8,
        "efficiency": 0.6666666666666666,
        "status": "evaluated"
      },
      {
        "question_index": "question201",
        "gt_tool_count": 8,
        "model_tool_count": 10,
        "efficiency": 1.25,
        "status": "evaluated"
      },
      {
        "question_index": "question202",
        "gt_tool_count": 11,
        "model_tool_count": 9,
        "efficiency": 0.8181818181818182,
        "status": "evaluated"
      },
      {
        "question_index": "question203",
        "gt_tool_count": 10,
        "model_tool_count": 2,
        "efficiency": 0.2,
        "status": "evaluated"
      },
      {
        "question_index": "question204",
        "gt_tool_count": 4,
        "model_tool_count": 2,
        "efficiency": 0.5,
        "status": "evaluated"
      },
      {
        "question_index": "question205",
        "gt_tool_count": 4,
        "model_tool_count": 4,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question206",
        "gt_tool_count": 4,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question207",
        "gt_tool_count": 4,
        "model_tool_count": 11,
        "efficiency": 2.75,
        "status": "evaluated"
      },
      {
        "question_index": "question208",
        "gt_tool_count": 4,
        "model_tool_count": 4,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question209",
        "gt_tool_count": 4,
        "model_tool_count": 4,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question210",
        "gt_tool_count": 3,
        "model_tool_count": 2,
        "efficiency": 0.6666666666666666,
        "status": "evaluated"
      },
      {
        "question_index": "question211",
        "gt_tool_count": 5,
        "model_tool_count": 5,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question212",
        "gt_tool_count": 5,
        "model_tool_count": 2,
        "efficiency": 0.4,
        "status": "evaluated"
      },
      {
        "question_index": "question213",
        "gt_tool_count": 4,
        "model_tool_count": 20,
        "efficiency": 5.0,
        "status": "evaluated"
      },
      {
        "question_index": "question214",
        "gt_tool_count": 10,
        "model_tool_count": 23,
        "efficiency": 2.3,
        "status": "evaluated"
      },
      {
        "question_index": "question215",
        "gt_tool_count": 10,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question216",
        "gt_tool_count": 3,
        "model_tool_count": 1,
        "efficiency": 0.3333333333333333,
        "status": "evaluated"
      },
      {
        "question_index": "question217",
        "gt_tool_count": 3,
        "model_tool_count": 24,
        "efficiency": 8.0,
        "status": "evaluated"
      },
      {
        "question_index": "question218",
        "gt_tool_count": 3,
        "model_tool_count": 24,
        "efficiency": 8.0,
        "status": "evaluated"
      },
      {
        "question_index": "question219",
        "gt_tool_count": 3,
        "model_tool_count": 2,
        "efficiency": 0.6666666666666666,
        "status": "evaluated"
      },
      {
        "question_index": "question220",
        "gt_tool_count": 3,
        "model_tool_count": 6,
        "efficiency": 2.0,
        "status": "evaluated"
      },
      {
        "question_index": "question221",
        "gt_tool_count": 5,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question222",
        "gt_tool_count": 5,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question223",
        "gt_tool_count": 5,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question224",
        "gt_tool_count": 3,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question225",
        "gt_tool_count": 3,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question226",
        "gt_tool_count": 3,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      },
      {
        "question_index": "question227",
        "gt_tool_count": 3,
        "model_tool_count": 1,
        "efficiency": 0.3333333333333333,
        "status": "evaluated"
      },
      {
        "question_index": "question228",
        "gt_tool_count": 3,
        "model_tool_count": 21,
        "efficiency": 7.0,
        "status": "evaluated"
      },
      {
        "question_index": "question229",
        "gt_tool_count": 3,
        "model_tool_count": 2,
        "efficiency": 0.6666666666666666,
        "status": "evaluated"
      },
      {
        "question_index": "question230",
        "gt_tool_count": 8,
        "model_tool_count": 8,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question231",
        "gt_tool_count": 4,
        "model_tool_count": 2,
        "efficiency": 0.5,
        "status": "evaluated"
      },
      {
        "question_index": "question232",
        "gt_tool_count": 3,
        "model_tool_count": 3,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question233",
        "gt_tool_count": 5,
        "model_tool_count": 5,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question234",
        "gt_tool_count": 5,
        "model_tool_count": 1,
        "efficiency": 0.2,
        "status": "evaluated"
      },
      {
        "question_index": "question235",
        "gt_tool_count": 5,
        "model_tool_count": 2,
        "efficiency": 0.4,
        "status": "evaluated"
      },
      {
        "question_index": "question236",
        "gt_tool_count": 5,
        "model_tool_count": 5,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question237",
        "gt_tool_count": 5,
        "model_tool_count": 23,
        "efficiency": 4.6,
        "status": "evaluated"
      },
      {
        "question_index": "question238",
        "gt_tool_count": 5,
        "model_tool_count": 5,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question239",
        "gt_tool_count": 5,
        "model_tool_count": 5,
        "efficiency": 1.0,
        "status": "evaluated"
      },
      {
        "question_index": "question240",
        "gt_tool_count": 5,
        "model_tool_count": 24,
        "efficiency": 4.8,
        "status": "evaluated"
      },
      {
        "question_index": "question241",
        "gt_tool_count": 5,
        "model_tool_count": 7,
        "efficiency": 1.4,
        "status": "evaluated"
      },
      {
        "question_index": "question242",
        "gt_tool_count": 3,
        "model_tool_count": 1,
        "efficiency": 0.3333333333333333,
        "status": "evaluated"
      },
      {
        "question_index": "question243",
        "gt_tool_count": 3,
        "model_tool_count": 1,
        "efficiency": 0.3333333333333333,
        "status": "evaluated"
      },
      {
        "question_index": "question244",
        "gt_tool_count": 3,
        "model_tool_count": 11,
        "efficiency": 3.6666666666666665,
        "status": "evaluated"
      },
      {
        "question_index": "question245",
        "gt_tool_count": 3,
        "model_tool_count": 9,
        "efficiency": 3.0,
        "status": "evaluated"
      },
      {
        "question_index": "question246",
        "gt_tool_count": 3,
        "model_tool_count": 24,
        "efficiency": 8.0,
        "status": "evaluated"
      },
      {
        "question_index": "question247",
        "gt_tool_count": 3,
        "model_tool_count": 7,
        "efficiency": 2.3333333333333335,
        "status": "evaluated"
      },
      {
        "question_index": "question248",
        "gt_tool_count": 3,
        "model_tool_count": 0,
        "efficiency": 0.0,
        "status": "evaluated"
      }
    ]
  },
  "summary": {
    "total_questions": 247,
    "accuracy_rate": 0.288135593220339,
    "average_efficiency": 1.45345163311265
  }
}