{
  "summary": {
    "total_instances": 168,
    "avg_turns": 13.14,
    "stuck_in_loop_percentage": 0.0,
    "overall_tool_accuracy": 81.11,
    "total_prompt_tokens": 13508659,
    "total_completion_tokens": 225002,
    "avg_prompt_tokens": 80408.68,
    "avg_completion_tokens": 1339.3,
    "classification_counts": {
      "empty_patch": 0,
      "resolved": 0,
      "unresolved": 168,
      "other": 0
    }
  },
  "instance_ids": [
    "sympy__sympy-12454",
    "django__django-13925",
    "django__django-13447",
    "django__django-11797",
    "pytest-dev__pytest-7168",
    "django__django-13448",
    "django__django-12589",
    "astropy__astropy-7746",
    "scikit-learn__scikit-learn-10949",
    "pallets__flask-5063",
    "matplotlib__matplotlib-25433",
    "sympy__sympy-18698",
    "pytest-dev__pytest-5692",
    "sympy__sympy-13146",
    "sympy__sympy-23191",
    "django__django-14997",
    "django__django-13551",
    "sympy__sympy-18199",
    "matplotlib__matplotlib-18869",
    "sympy__sympy-16988",
    "sympy__sympy-16106",
    "sympy__sympy-13895",
    "sympy__sympy-11897",
    "sympy__sympy-12481",
    "django__django-15781",
    "django__django-15902",
    "pytest-dev__pytest-5413",
    "sphinx-doc__sphinx-7738",
    "sphinx-doc__sphinx-8506",
    "matplotlib__matplotlib-24970",
    "pallets__flask-4045",
    "django__django-14534",
    "matplotlib__matplotlib-23562",
    "django__django-15498",
    "sympy__sympy-15011",
    "psf__requests-2317",
    "sympy__sympy-22005",
    "sympy__sympy-17630",
    "scikit-learn__scikit-learn-11281",
    "sympy__sympy-12236",
    "django__django-12915",
    "django__django-16816",
    "django__django-13757",
    "django__django-15061",
    "sympy__sympy-15346",
    "sympy__sympy-22714",
    "sympy__sympy-24066",
    "django__django-15252",
    "pylint-dev__pylint-7080",
    "sphinx-doc__sphinx-7975",
    "matplotlib__matplotlib-24265",
    "scikit-learn__scikit-learn-14092",
    "scikit-learn__scikit-learn-25747",
    "sympy__sympy-20590",
    "sympy__sympy-15308",
    "scikit-learn__scikit-learn-10297",
    "django__django-13220",
    "django__django-12184",
    "sympy__sympy-19487",
    "sympy__sympy-20442",
    "django__django-16408",
    "sympy__sympy-14817",
    "scikit-learn__scikit-learn-25500",
    "django__django-13033",
    "sympy__sympy-15678",
    "sympy__sympy-14317",
    "matplotlib__matplotlib-23299",
    "matplotlib__matplotlib-23987",
    "django__django-13315",
    "django__django-17051",
    "scikit-learn__scikit-learn-25570",
    "matplotlib__matplotlib-22835",
    "psf__requests-2674",
    "pytest-dev__pytest-8365",
    "matplotlib__matplotlib-22711",
    "sympy__sympy-23262",
    "astropy__astropy-6938",
    "sympy__sympy-21379",
    "sympy__sympy-12171",
    "sympy__sympy-21627",
    "matplotlib__matplotlib-24334",
    "sympy__sympy-20154",
    "django__django-11910",
    "pylint-dev__pylint-7228",
    "astropy__astropy-14365",
    "pytest-dev__pytest-8906",
    "django__django-11001",
    "django__django-12113",
    "sympy__sympy-17655",
    "django__django-13660",
    "sympy__sympy-19254",
    "pydata__xarray-4248",
    "django__django-11620",
    "django__django-13768",
    "sympy__sympy-14308",
    "mwaskom__seaborn-3407",
    "django__django-15790",
    "sympy__sympy-20049",
    "django__django-15996",
    "pytest-dev__pytest-5103",
    "django__django-12308",
    "sphinx-doc__sphinx-8273",
    "django__django-13321",
    "sympy__sympy-19007",
    "django__django-15202",
    "django__django-15400",
    "django__django-11964",
    "django__django-12470",
    "django__django-16400",
    "pallets__flask-4992",
    "pylint-dev__pylint-6506",
    "matplotlib__matplotlib-25498",
    "psf__requests-2148",
    "sympy__sympy-17022",
    "sympy__sympy-13437",
    "pytest-dev__pytest-11143",
    "django__django-11583",
    "matplotlib__matplotlib-23913",
    "django__django-14155",
    "sphinx-doc__sphinx-11445",
    "sympy__sympy-13773",
    "sympy__sympy-11870",
    "sympy__sympy-18835",
    "django__django-11848",
    "matplotlib__matplotlib-23563",
    "sympy__sympy-16792",
    "astropy__astropy-12907",
    "sympy__sympy-13177",
    "sympy__sympy-17139",
    "django__django-11742",
    "django__django-12856",
    "sympy__sympy-16281",
    "django__django-14915",
    "django__django-16046",
    "django__django-11283",
    "sympy__sympy-11400",
    "django__django-13028",
    "django__django-11019",
    "django__django-14411",
    "sympy__sympy-14024",
    "django__django-16820",
    "sympy__sympy-21612",
    "scikit-learn__scikit-learn-10508",
    "django__django-13590",
    "django__django-14730",
    "django__django-14017",
    "scikit-learn__scikit-learn-11040",
    "matplotlib__matplotlib-25332",
    "scikit-learn__scikit-learn-14983",
    "sphinx-doc__sphinx-8627",
    "sympy__sympy-15345",
    "pydata__xarray-4094",
    "pytest-dev__pytest-7220",
    "django__django-13265",
    "matplotlib__matplotlib-25079",
    "sympy__sympy-24909",
    "pylint-dev__pylint-7114",
    "pydata__xarray-3364",
    "scikit-learn__scikit-learn-14087",
    "sympy__sympy-20322",
    "django__django-15819",
    "scikit-learn__scikit-learn-13497",
    "django__django-11905",
    "sympy__sympy-22840",
    "django__django-13933",
    "psf__requests-1963",
    "sympy__sympy-13043",
    "sphinx-doc__sphinx-7686"
  ],
  "turns_list": [
    4,
    17,
    17,
    24,
    11,
    13,
    7,
    10,
    15,
    8,
    11,
    16,
    6,
    21,
    9,
    17,
    5,
    7,
    25,
    16,
    24,
    24,
    28,
    7,
    7,
    13,
    29,
    21,
    8,
    6,
    9,
    4,
    24,
    13,
    20,
    6,
    13,
    15,
    17,
    30,
    5,
    6,
    17,
    20,
    6,
    7,
    19,
    5,
    4,
    11,
    17,
    18,
    14,
    5,
    5,
    26,
    4,
    4,
    5,
    8,
    25,
    8,
    7,
    12,
    12,
    5,
    3,
    29,
    17,
    26,
    31,
    25,
    10,
    4,
    11,
    13,
    3,
    19,
    10,
    7,
    9,
    8,
    10,
    5,
    27,
    27,
    23,
    13,
    20,
    5,
    8,
    25,
    16,
    6,
    28,
    26,
    12,
    3,
    4,
    9,
    8,
    13,
    3,
    9,
    12,
    4,
    11,
    12,
    9,
    7,
    6,
    13,
    11,
    20,
    11,
    11,
    8,
    5,
    5,
    22,
    23,
    25,
    8,
    12,
    26,
    23,
    6,
    7,
    8,
    20,
    11,
    9,
    9,
    11,
    8,
    7,
    7,
    10,
    5,
    30,
    6,
    19,
    11,
    11,
    9,
    3,
    23,
    8,
    22,
    9,
    4,
    26,
    3,
    20,
    23,
    28,
    9,
    5,
    30,
    9,
    9,
    9,
    23,
    23,
    9,
    27,
    8,
    13
  ],
  "stuck_in_loop_list": [
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0,
    0
  ],
  "tool_accuracy_list": [
    100.0,
    94.12,
    94.12,
    95.83,
    100.0,
    76.92,
    100.0,
    70.0,
    86.67,
    75.0,
    100.0,
    87.5,
    50.0,
    85.71,
    88.89,
    88.24,
    80.0,
    71.43,
    88.0,
    93.75,
    100.0,
    100.0,
    100.0,
    14.29,
    100.0,
    76.92,
    93.1,
    80.95,
    100.0,
    100.0,
    44.44,
    75.0,
    100.0,
    84.62,
    95.0,
    83.33,
    69.23,
    26.67,
    70.59,
    90.0,
    100.0,
    0.0,
    94.12,
    90.0,
    100.0,
    28.57,
    36.84,
    40.0,
    100.0,
    81.82,
    70.59,
    77.78,
    100.0,
    80.0,
    80.0,
    88.46,
    0.0,
    100.0,
    80.0,
    87.5,
    92.0,
    100.0,
    85.71,
    75.0,
    41.67,
    80.0,
    100.0,
    96.55,
    94.12,
    76.92,
    100.0,
    68.0,
    10.0,
    0.0,
    90.91,
    84.62,
    100.0,
    78.95,
    100.0,
    85.71,
    66.67,
    100.0,
    80.0,
    100.0,
    88.89,
    44.44,
    91.3,
    84.62,
    25.0,
    80.0,
    87.5,
    88.0,
    25.0,
    83.33,
    96.43,
    100.0,
    33.33,
    33.33,
    75.0,
    22.22,
    100.0,
    100.0,
    0.0,
    77.78,
    41.67,
    75.0,
    72.73,
    83.33,
    100.0,
    28.57,
    50.0,
    92.31,
    27.27,
    90.0,
    90.91,
    100.0,
    50.0,
    80.0,
    80.0,
    90.91,
    78.26,
    96.0,
    37.5,
    66.67,
    100.0,
    78.26,
    83.33,
    100.0,
    100.0,
    45.0,
    54.55,
    77.78,
    88.89,
    100.0,
    100.0,
    71.43,
    100.0,
    50.0,
    60.0,
    100.0,
    83.33,
    68.42,
    90.91,
    90.91,
    33.33,
    33.33,
    86.96,
    62.5,
    95.45,
    100.0,
    100.0,
    96.15,
    100.0,
    85.0,
    86.96,
    100.0,
    55.56,
    40.0,
    100.0,
    88.89,
    100.0,
    100.0,
    69.57,
    100.0,
    0.0,
    100.0,
    87.5,
    53.85
  ],
  "prompt_tokens_list": [
    7788,
    105949,
    112887,
    112953,
    58686,
    67354,
    17635,
    27391,
    66308,
    47248,
    23261,
    122092,
    27989,
    124064,
    55302,
    95293,
    9993,
    30491,
    257074,
    37093,
    223336,
    154827,
    170393,
    48013,
    9179,
    26045,
    137152,
    109571,
    59286,
    10555,
    32004,
    18401,
    93559,
    105461,
    78396,
    23090,
    93924,
    117230,
    101699,
    220856,
    14238,
    18035,
    84531,
    159319,
    16153,
    12274,
    78599,
    13752,
    45650,
    45443,
    55489,
    105808,
    70206,
    28143,
    11846,
    349216,
    15103,
    6334,
    32755,
    66454,
    267649,
    15568,
    20794,
    101918,
    46598,
    41486,
    4658,
    162732,
    40612,
    171482,
    218692,
    122237,
    62894,
    17780,
    44935,
    107303,
    3387,
    98280,
    42937,
    40018,
    69828,
    28741,
    89744,
    13108,
    152618,
    248994,
    113872,
    50811,
    144330,
    14832,
    54454,
    185939,
    55544,
    28636,
    253264,
    142830,
    46173,
    4118,
    16630,
    103733,
    10755,
    31881,
    5661,
    53968,
    80914,
    18009,
    64298,
    83274,
    25858,
    42744,
    12796,
    32751,
    35269,
    76668,
    43129,
    38882,
    18768,
    54663,
    27303,
    96745,
    279339,
    205424,
    32400,
    63824,
    162432,
    139198,
    31084,
    9848,
    20085,
    246088,
    55239,
    87330,
    40248,
    17482,
    10287,
    39126,
    15829,
    88789,
    18551,
    177911,
    54656,
    85266,
    55414,
    70570,
    17248,
    3302,
    269560,
    15508,
    480709,
    20439,
    8550,
    118803,
    4078,
    186769,
    171709,
    198706,
    59711,
    17376,
    411029,
    60973,
    38293,
    32747,
    92551,
    189221,
    76274,
    170513,
    41427,
    43072
  ],
  "completion_tokens_list": [
    340,
    1449,
    1766,
    2492,
    946,
    723,
    817,
    677,
    1233,
    725,
    749,
    1526,
    519,
    2583,
    770,
    1414,
    159,
    590,
    2114,
    784,
    3455,
    3655,
    3349,
    669,
    492,
    1020,
    2633,
    1325,
    1404,
    525,
    408,
    236,
    1652,
    871,
    1892,
    976,
    1444,
    1624,
    1867,
    3281,
    353,
    613,
    2010,
    1761,
    1342,
    559,
    1060,
    357,
    433,
    1056,
    927,
    1370,
    819,
    763,
    641,
    1624,
    389,
    634,
    161,
    1040,
    3917,
    741,
    403,
    1696,
    802,
    600,
    335,
    3213,
    1170,
    4755,
    1393,
    1656,
    2449,
    342,
    304,
    2124,
    142,
    1968,
    223,
    453,
    1015,
    615,
    2494,
    177,
    1638,
    3742,
    1525,
    1152,
    2942,
    425,
    965,
    3052,
    1518,
    242,
    2096,
    2223,
    4877,
    472,
    410,
    1710,
    551,
    1206,
    140,
    1345,
    2726,
    324,
    1148,
    2148,
    802,
    517,
    416,
    636,
    1087,
    2332,
    1252,
    994,
    365,
    930,
    723,
    2879,
    4138,
    2913,
    1185,
    1143,
    1784,
    1861,
    838,
    381,
    635,
    1151,
    598,
    672,
    631,
    387,
    437,
    1105,
    260,
    977,
    234,
    3171,
    510,
    1510,
    754,
    1189,
    680,
    458,
    1672,
    219,
    3544,
    938,
    395,
    1346,
    516,
    1466,
    2585,
    3346,
    1924,
    1181,
    3491,
    1686,
    2333,
    623,
    1312,
    4244,
    1177,
    1649,
    822,
    1368
  ],
  "category_breakdown": {
    "empty_patch": {
      "instance_ids": [],
      "turns_list": [],
      "stuck_in_loop_list": [],
      "tool_accuracy_list": [],
      "prompt_tokens_list": [],
      "completion_tokens_list": [],
      "averages": {
        "avg_turns": 0,
        "avg_stuck_in_loop_rate": 0,
        "avg_tool_accuracy": 0,
        "avg_prompt_tokens": 0,
        "avg_completion_tokens": 0
      }
    },
    "resolved": {
      "instance_ids": [],
      "turns_list": [],
      "stuck_in_loop_list": [],
      "tool_accuracy_list": [],
      "prompt_tokens_list": [],
      "completion_tokens_list": [],
      "averages": {
        "avg_turns": 0,
        "avg_stuck_in_loop_rate": 0,
        "avg_tool_accuracy": 0,
        "avg_prompt_tokens": 0,
        "avg_completion_tokens": 0
      }
    },
    "unresolved": {
      "instance_ids": [
        "sympy__sympy-12454",
        "django__django-13925",
        "django__django-13447",
        "django__django-11797",
        "pytest-dev__pytest-7168",
        "django__django-13448",
        "django__django-12589",
        "astropy__astropy-7746",
        "scikit-learn__scikit-learn-10949",
        "pallets__flask-5063",
        "matplotlib__matplotlib-25433",
        "sympy__sympy-18698",
        "pytest-dev__pytest-5692",
        "sympy__sympy-13146",
        "sympy__sympy-23191",
        "django__django-14997",
        "django__django-13551",
        "sympy__sympy-18199",
        "matplotlib__matplotlib-18869",
        "sympy__sympy-16988",
        "sympy__sympy-16106",
        "sympy__sympy-13895",
        "sympy__sympy-11897",
        "sympy__sympy-12481",
        "django__django-15781",
        "django__django-15902",
        "pytest-dev__pytest-5413",
        "sphinx-doc__sphinx-7738",
        "sphinx-doc__sphinx-8506",
        "matplotlib__matplotlib-24970",
        "pallets__flask-4045",
        "django__django-14534",
        "matplotlib__matplotlib-23562",
        "django__django-15498",
        "sympy__sympy-15011",
        "psf__requests-2317",
        "sympy__sympy-22005",
        "sympy__sympy-17630",
        "scikit-learn__scikit-learn-11281",
        "sympy__sympy-12236",
        "django__django-12915",
        "django__django-16816",
        "django__django-13757",
        "django__django-15061",
        "sympy__sympy-15346",
        "sympy__sympy-22714",
        "sympy__sympy-24066",
        "django__django-15252",
        "pylint-dev__pylint-7080",
        "sphinx-doc__sphinx-7975",
        "matplotlib__matplotlib-24265",
        "scikit-learn__scikit-learn-14092",
        "scikit-learn__scikit-learn-25747",
        "sympy__sympy-20590",
        "sympy__sympy-15308",
        "scikit-learn__scikit-learn-10297",
        "django__django-13220",
        "django__django-12184",
        "sympy__sympy-19487",
        "sympy__sympy-20442",
        "django__django-16408",
        "sympy__sympy-14817",
        "scikit-learn__scikit-learn-25500",
        "django__django-13033",
        "sympy__sympy-15678",
        "sympy__sympy-14317",
        "matplotlib__matplotlib-23299",
        "matplotlib__matplotlib-23987",
        "django__django-13315",
        "django__django-17051",
        "scikit-learn__scikit-learn-25570",
        "matplotlib__matplotlib-22835",
        "psf__requests-2674",
        "pytest-dev__pytest-8365",
        "matplotlib__matplotlib-22711",
        "sympy__sympy-23262",
        "astropy__astropy-6938",
        "sympy__sympy-21379",
        "sympy__sympy-12171",
        "sympy__sympy-21627",
        "matplotlib__matplotlib-24334",
        "sympy__sympy-20154",
        "django__django-11910",
        "pylint-dev__pylint-7228",
        "astropy__astropy-14365",
        "pytest-dev__pytest-8906",
        "django__django-11001",
        "django__django-12113",
        "sympy__sympy-17655",
        "django__django-13660",
        "sympy__sympy-19254",
        "pydata__xarray-4248",
        "django__django-11620",
        "django__django-13768",
        "sympy__sympy-14308",
        "mwaskom__seaborn-3407",
        "django__django-15790",
        "sympy__sympy-20049",
        "django__django-15996",
        "pytest-dev__pytest-5103",
        "django__django-12308",
        "sphinx-doc__sphinx-8273",
        "django__django-13321",
        "sympy__sympy-19007",
        "django__django-15202",
        "django__django-15400",
        "django__django-11964",
        "django__django-12470",
        "django__django-16400",
        "pallets__flask-4992",
        "pylint-dev__pylint-6506",
        "matplotlib__matplotlib-25498",
        "psf__requests-2148",
        "sympy__sympy-17022",
        "sympy__sympy-13437",
        "pytest-dev__pytest-11143",
        "django__django-11583",
        "matplotlib__matplotlib-23913",
        "django__django-14155",
        "sphinx-doc__sphinx-11445",
        "sympy__sympy-13773",
        "sympy__sympy-11870",
        "sympy__sympy-18835",
        "django__django-11848",
        "matplotlib__matplotlib-23563",
        "sympy__sympy-16792",
        "astropy__astropy-12907",
        "sympy__sympy-13177",
        "sympy__sympy-17139",
        "django__django-11742",
        "django__django-12856",
        "sympy__sympy-16281",
        "django__django-14915",
        "django__django-16046",
        "django__django-11283",
        "sympy__sympy-11400",
        "django__django-13028",
        "django__django-11019",
        "django__django-14411",
        "sympy__sympy-14024",
        "django__django-16820",
        "sympy__sympy-21612",
        "scikit-learn__scikit-learn-10508",
        "django__django-13590",
        "django__django-14730",
        "django__django-14017",
        "scikit-learn__scikit-learn-11040",
        "matplotlib__matplotlib-25332",
        "scikit-learn__scikit-learn-14983",
        "sphinx-doc__sphinx-8627",
        "sympy__sympy-15345",
        "pydata__xarray-4094",
        "pytest-dev__pytest-7220",
        "django__django-13265",
        "matplotlib__matplotlib-25079",
        "sympy__sympy-24909",
        "pylint-dev__pylint-7114",
        "pydata__xarray-3364",
        "scikit-learn__scikit-learn-14087",
        "sympy__sympy-20322",
        "django__django-15819",
        "scikit-learn__scikit-learn-13497",
        "django__django-11905",
        "sympy__sympy-22840",
        "django__django-13933",
        "psf__requests-1963",
        "sympy__sympy-13043",
        "sphinx-doc__sphinx-7686"
      ],
      "turns_list": [
        4,
        17,
        17,
        24,
        11,
        13,
        7,
        10,
        15,
        8,
        11,
        16,
        6,
        21,
        9,
        17,
        5,
        7,
        25,
        16,
        24,
        24,
        28,
        7,
        7,
        13,
        29,
        21,
        8,
        6,
        9,
        4,
        24,
        13,
        20,
        6,
        13,
        15,
        17,
        30,
        5,
        6,
        17,
        20,
        6,
        7,
        19,
        5,
        4,
        11,
        17,
        18,
        14,
        5,
        5,
        26,
        4,
        4,
        5,
        8,
        25,
        8,
        7,
        12,
        12,
        5,
        3,
        29,
        17,
        26,
        31,
        25,
        10,
        4,
        11,
        13,
        3,
        19,
        10,
        7,
        9,
        8,
        10,
        5,
        27,
        27,
        23,
        13,
        20,
        5,
        8,
        25,
        16,
        6,
        28,
        26,
        12,
        3,
        4,
        9,
        8,
        13,
        3,
        9,
        12,
        4,
        11,
        12,
        9,
        7,
        6,
        13,
        11,
        20,
        11,
        11,
        8,
        5,
        5,
        22,
        23,
        25,
        8,
        12,
        26,
        23,
        6,
        7,
        8,
        20,
        11,
        9,
        9,
        11,
        8,
        7,
        7,
        10,
        5,
        30,
        6,
        19,
        11,
        11,
        9,
        3,
        23,
        8,
        22,
        9,
        4,
        26,
        3,
        20,
        23,
        28,
        9,
        5,
        30,
        9,
        9,
        9,
        23,
        23,
        9,
        27,
        8,
        13
      ],
      "stuck_in_loop_list": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0
      ],
      "tool_accuracy_list": [
        100.0,
        94.12,
        94.12,
        95.83,
        100.0,
        76.92,
        100.0,
        70.0,
        86.67,
        75.0,
        100.0,
        87.5,
        50.0,
        85.71,
        88.89,
        88.24,
        80.0,
        71.43,
        88.0,
        93.75,
        100.0,
        100.0,
        100.0,
        14.29,
        100.0,
        76.92,
        93.1,
        80.95,
        100.0,
        100.0,
        44.44,
        75.0,
        100.0,
        84.62,
        95.0,
        83.33,
        69.23,
        26.67,
        70.59,
        90.0,
        100.0,
        0.0,
        94.12,
        90.0,
        100.0,
        28.57,
        36.84,
        40.0,
        100.0,
        81.82,
        70.59,
        77.78,
        100.0,
        80.0,
        80.0,
        88.46,
        0.0,
        100.0,
        80.0,
        87.5,
        92.0,
        100.0,
        85.71,
        75.0,
        41.67,
        80.0,
        100.0,
        96.55,
        94.12,
        76.92,
        100.0,
        68.0,
        10.0,
        0.0,
        90.91,
        84.62,
        100.0,
        78.95,
        100.0,
        85.71,
        66.67,
        100.0,
        80.0,
        100.0,
        88.89,
        44.44,
        91.3,
        84.62,
        25.0,
        80.0,
        87.5,
        88.0,
        25.0,
        83.33,
        96.43,
        100.0,
        33.33,
        33.33,
        75.0,
        22.22,
        100.0,
        100.0,
        0.0,
        77.78,
        41.67,
        75.0,
        72.73,
        83.33,
        100.0,
        28.57,
        50.0,
        92.31,
        27.27,
        90.0,
        90.91,
        100.0,
        50.0,
        80.0,
        80.0,
        90.91,
        78.26,
        96.0,
        37.5,
        66.67,
        100.0,
        78.26,
        83.33,
        100.0,
        100.0,
        45.0,
        54.55,
        77.78,
        88.89,
        100.0,
        100.0,
        71.43,
        100.0,
        50.0,
        60.0,
        100.0,
        83.33,
        68.42,
        90.91,
        90.91,
        33.33,
        33.33,
        86.96,
        62.5,
        95.45,
        100.0,
        100.0,
        96.15,
        100.0,
        85.0,
        86.96,
        100.0,
        55.56,
        40.0,
        100.0,
        88.89,
        100.0,
        100.0,
        69.57,
        100.0,
        0.0,
        100.0,
        87.5,
        53.85
      ],
      "prompt_tokens_list": [
        7788,
        105949,
        112887,
        112953,
        58686,
        67354,
        17635,
        27391,
        66308,
        47248,
        23261,
        122092,
        27989,
        124064,
        55302,
        95293,
        9993,
        30491,
        257074,
        37093,
        223336,
        154827,
        170393,
        48013,
        9179,
        26045,
        137152,
        109571,
        59286,
        10555,
        32004,
        18401,
        93559,
        105461,
        78396,
        23090,
        93924,
        117230,
        101699,
        220856,
        14238,
        18035,
        84531,
        159319,
        16153,
        12274,
        78599,
        13752,
        45650,
        45443,
        55489,
        105808,
        70206,
        28143,
        11846,
        349216,
        15103,
        6334,
        32755,
        66454,
        267649,
        15568,
        20794,
        101918,
        46598,
        41486,
        4658,
        162732,
        40612,
        171482,
        218692,
        122237,
        62894,
        17780,
        44935,
        107303,
        3387,
        98280,
        42937,
        40018,
        69828,
        28741,
        89744,
        13108,
        152618,
        248994,
        113872,
        50811,
        144330,
        14832,
        54454,
        185939,
        55544,
        28636,
        253264,
        142830,
        46173,
        4118,
        16630,
        103733,
        10755,
        31881,
        5661,
        53968,
        80914,
        18009,
        64298,
        83274,
        25858,
        42744,
        12796,
        32751,
        35269,
        76668,
        43129,
        38882,
        18768,
        54663,
        27303,
        96745,
        279339,
        205424,
        32400,
        63824,
        162432,
        139198,
        31084,
        9848,
        20085,
        246088,
        55239,
        87330,
        40248,
        17482,
        10287,
        39126,
        15829,
        88789,
        18551,
        177911,
        54656,
        85266,
        55414,
        70570,
        17248,
        3302,
        269560,
        15508,
        480709,
        20439,
        8550,
        118803,
        4078,
        186769,
        171709,
        198706,
        59711,
        17376,
        411029,
        60973,
        38293,
        32747,
        92551,
        189221,
        76274,
        170513,
        41427,
        43072
      ],
      "completion_tokens_list": [
        340,
        1449,
        1766,
        2492,
        946,
        723,
        817,
        677,
        1233,
        725,
        749,
        1526,
        519,
        2583,
        770,
        1414,
        159,
        590,
        2114,
        784,
        3455,
        3655,
        3349,
        669,
        492,
        1020,
        2633,
        1325,
        1404,
        525,
        408,
        236,
        1652,
        871,
        1892,
        976,
        1444,
        1624,
        1867,
        3281,
        353,
        613,
        2010,
        1761,
        1342,
        559,
        1060,
        357,
        433,
        1056,
        927,
        1370,
        819,
        763,
        641,
        1624,
        389,
        634,
        161,
        1040,
        3917,
        741,
        403,
        1696,
        802,
        600,
        335,
        3213,
        1170,
        4755,
        1393,
        1656,
        2449,
        342,
        304,
        2124,
        142,
        1968,
        223,
        453,
        1015,
        615,
        2494,
        177,
        1638,
        3742,
        1525,
        1152,
        2942,
        425,
        965,
        3052,
        1518,
        242,
        2096,
        2223,
        4877,
        472,
        410,
        1710,
        551,
        1206,
        140,
        1345,
        2726,
        324,
        1148,
        2148,
        802,
        517,
        416,
        636,
        1087,
        2332,
        1252,
        994,
        365,
        930,
        723,
        2879,
        4138,
        2913,
        1185,
        1143,
        1784,
        1861,
        838,
        381,
        635,
        1151,
        598,
        672,
        631,
        387,
        437,
        1105,
        260,
        977,
        234,
        3171,
        510,
        1510,
        754,
        1189,
        680,
        458,
        1672,
        219,
        3544,
        938,
        395,
        1346,
        516,
        1466,
        2585,
        3346,
        1924,
        1181,
        3491,
        1686,
        2333,
        623,
        1312,
        4244,
        1177,
        1649,
        822,
        1368
      ],
      "averages": {
        "avg_turns": 13.14,
        "avg_stuck_in_loop_rate": 0.0,
        "avg_tool_accuracy": 77.13,
        "avg_prompt_tokens": 80408.68,
        "avg_completion_tokens": 1339.3
      }
    },
    "other": {
      "instance_ids": [],
      "turns_list": [],
      "stuck_in_loop_list": [],
      "tool_accuracy_list": [],
      "prompt_tokens_list": [],
      "completion_tokens_list": [],
      "averages": {
        "avg_turns": 0,
        "avg_stuck_in_loop_rate": 0,
        "avg_tool_accuracy": 0,
        "avg_prompt_tokens": 0,
        "avg_completion_tokens": 0
      }
    }
  },
  "timestamp": "2025-07-29T14:42:42.150673",
  "intra_correctness_list": [
    29.25,
    33.294117647058826,
    43.294117647058826,
    40.416666666666664,
    43.54545454545455,
    28.76923076923077,
    62.142857142857146,
    48.9,
    43.93333333333333,
    46.75,
    52,
    44.625,
    70.33333333333333,
    24.142857142857142,
    38.888888888888886,
    27,
    39.8,
    39.57142857142857,
    69.2,
    33.625,
    29.5,
    53.291666666666664,
    40.464285714285715,
    60,
    24.857142857142858,
    36.84615384615385,
    25.724137931034484,
    39.42857142857143,
    56.75,
    26.166666666666668,
    45.44444444444444,
    35.75,
    26.791666666666668,
    33,
    31.15,
    44,
    47.61538461538461,
    41.86666666666667,
    45.35294117647059,
    61.233333333333334,
    42.4,
    55.666666666666664,
    37,
    35.1,
    60.166666666666664,
    50.142857142857146,
    27.263157894736842,
    41,
    29.25,
    33.09090909090909,
    36.470588235294116,
    31.944444444444443,
    38.07142857142857,
    49.8,
    41.2,
    26.26923076923077,
    42.75,
    51.25,
    45,
    44.625,
    27.32,
    46,
    42.142857142857146,
    24.5,
    37.583333333333336,
    32,
    35.666666666666664,
    35.58620689655172,
    40.64705882352941,
    33.84615384615385,
    30.29032258064516,
    26.8,
    48.1,
    19.25,
    38.72727272727273,
    47,
    55,
    41.78947368421053,
    41.8,
    34.285714285714285,
    43.333333333333336,
    43,
    28.1,
    44,
    29.25925925925926,
    28.22222222222222,
    36.608695652173914,
    30.53846153846154,
    31.35,
    59,
    52.125,
    27.24,
    41.625,
    48.333333333333336,
    35.642857142857146,
    31.884615384615383,
    48.583333333333336,
    70.66666666666667,
    64.25,
    54.333333333333336,
    52.25,
    42.15384615384615,
    85,
    58.55555555555556,
    33.083333333333336,
    57.5,
    36.54545454545455,
    37.25,
    38.888888888888886,
    37.285714285714285,
    45.333333333333336,
    38.92307692307692,
    33.27272727272727,
    29.1,
    30.636363636363637,
    16.09090909090909,
    41,
    68.2,
    67.8,
    45.22727272727273,
    17.91304347826087,
    41.56,
    50.875,
    52.583333333333336,
    29.923076923076923,
    39.17391304347826,
    47.333333333333336,
    50.142857142857146,
    54.25,
    30.9,
    37.36363636363637,
    52,
    28.333333333333332,
    24.636363636363637,
    26.125,
    47.714285714285715,
    50,
    50.7,
    20,
    54.6,
    34.833333333333336,
    35.421052631578945,
    26.818181818181817,
    33.72727272727273,
    64.33333333333333,
    60.666666666666664,
    31.043478260869566,
    42.75,
    35.95454545454545,
    31.88888888888889,
    46.75,
    36.57692307692308,
    54,
    34.95,
    57.65217391304348,
    50.5,
    33.55555555555556,
    38.8,
    15.933333333333334,
    39.333333333333336,
    60.22222222222222,
    37.22222222222222,
    42.65217391304348,
    21.73913043478261,
    48.111111111111114,
    57.074074074074076,
    56.875,
    38.61538461538461
  ],
  "inter_correctness_list": [
    57.5,
    72.05882352941177,
    74.70588235294117,
    71.25,
    61.36363636363637,
    51.53846153846154,
    93.57142857142857,
    55.5,
    66.33333333333333,
    81.25,
    67.27272727272727,
    68.75,
    90,
    66.42857142857143,
    55,
    61.1764705882353,
    93,
    50.714285714285715,
    86,
    48.4375,
    83.125,
    72.5,
    77.85714285714286,
    79.28571428571429,
    75.71428571428571,
    77.3076923076923,
    62.241379310344826,
    60,
    70,
    40,
    78.33333333333333,
    87.5,
    77.91666666666667,
    66.53846153846153,
    72,
    85.83333333333333,
    78.46153846153847,
    70,
    80.58823529411765,
    78.33333333333333,
    75,
    65,
    82.05882352941177,
    70.75,
    70,
    60.714285714285715,
    69.73684210526316,
    79,
    45,
    62.27272727272727,
    60.294117647058826,
    73.33333333333333,
    87.5,
    73,
    63,
    80.1923076923077,
    90,
    63.75,
    81,
    86.875,
    75.6,
    90,
    37.857142857142854,
    65.41666666666667,
    26.666666666666668,
    81,
    75,
    77.58620689655173,
    80.29411764705883,
    66.15384615384616,
    88.06451612903226,
    74,
    79,
    70,
    48.63636363636363,
    46.53846153846154,
    63.333333333333336,
    68.15789473684211,
    45.5,
    67.85714285714286,
    42.77777777777778,
    61.25,
    67,
    54,
    76.85185185185185,
    59.81481481481482,
    86.08695652173913,
    76.15384615384616,
    77.25,
    75,
    84.375,
    72,
    75,
    86.66666666666667,
    87.32142857142857,
    81.34615384615384,
    49.166666666666664,
    71.66666666666667,
    82.5,
    67.77777777777777,
    74.375,
    65.38461538461539,
    94,
    83.33333333333333,
    70.41666666666667,
    75,
    48.63636363636363,
    67.91666666666667,
    56.666666666666664,
    81.42857142857143,
    51.666666666666664,
    60,
    51.81818181818182,
    53.5,
    48.18181818181818,
    67.72727272727273,
    78.125,
    83,
    85,
    63.18181818181818,
    87.3913043478261,
    76,
    71.875,
    61.25,
    75.1923076923077,
    62.17391304347826,
    59.166666666666664,
    45,
    58.75,
    55,
    59.09090909090909,
    51.666666666666664,
    50,
    40.90909090909091,
    73.75,
    68.57142857142857,
    76.42857142857143,
    57,
    62,
    80.83333333333333,
    71.66666666666667,
    63.1578947368421,
    73.18181818181819,
    83.18181818181819,
    80.55555555555556,
    60,
    61.73913043478261,
    57.5,
    84.54545454545455,
    48.333333333333336,
    68.75,
    62.5,
    65,
    58.25,
    74.1304347826087,
    90.71428571428571,
    57.77777777777778,
    75,
    86.16666666666667,
    49.44444444444444,
    82.22222222222223,
    65.55555555555556,
    64.56521739130434,
    53.69565217391305,
    84.44444444444444,
    82.5925925925926,
    65,
    43.07692307692308
  ],
  "informativeness_list": [
    68.75,
    74.70588235294117,
    57.94117647058823,
    65.20833333333333,
    68.18181818181819,
    52.30769230769231,
    67.85714285714286,
    59.5,
    59.666666666666664,
    60,
    72.27272727272727,
    60.3125,
    88.33333333333333,
    49.285714285714285,
    69.44444444444444,
    37.64705882352941,
    60,
    47.857142857142854,
    54.8,
    57.1875,
    62.5,
    63.333333333333336,
    64.82142857142857,
    83.57142857142857,
    76.42857142857143,
    70.76923076923077,
    28.275862068965516,
    55,
    70.625,
    69.16666666666667,
    59.44444444444444,
    56.25,
    35.208333333333336,
    43.46153846153846,
    45.5,
    72.5,
    54.61538461538461,
    72,
    74.70588235294117,
    50.166666666666664,
    68,
    85,
    55,
    45.75,
    74.16666666666667,
    71.42857142857143,
    49.73684210526316,
    64,
    47.5,
    61.36363636363637,
    67.05882352941177,
    57.5,
    66.42857142857143,
    72,
    85,
    24.807692307692307,
    47.5,
    66.25,
    70,
    60.625,
    58.4,
    78.75,
    46.42857142857143,
    48.333333333333336,
    61.25,
    72,
    63.333333333333336,
    41.89655172413793,
    43.529411764705884,
    43.46153846153846,
    37.25806451612903,
    38.2,
    64,
    58.75,
    43.63636363636363,
    73.07692307692308,
    38.333333333333336,
    37.36842105263158,
    61.5,
    63.57142857142857,
    58.888888888888886,
    58.75,
    64.5,
    77,
    32.03703703703704,
    36.48148148148148,
    47.391304347826086,
    45,
    61.5,
    85,
    73.125,
    53.2,
    65,
    83.33333333333333,
    42.5,
    34.42307692307692,
    77.91666666666667,
    66.66666666666667,
    87.5,
    70.55555555555556,
    68.75,
    56.92307692307692,
    85,
    68.33333333333333,
    39.166666666666664,
    76.25,
    53.63636363636363,
    62.5,
    57.22222222222222,
    49.285714285714285,
    80.83333333333333,
    47.30769230769231,
    68.18181818181819,
    40,
    67.72727272727273,
    62.27272727272727,
    75.625,
    67,
    68,
    54.77272727272727,
    18.47826086956522,
    53,
    58.75,
    63.333333333333336,
    50.19230769230769,
    54.34782608695652,
    75.83333333333333,
    75.71428571428571,
    75.625,
    53.5,
    58.18181818181818,
    72.22222222222223,
    49.44444444444444,
    53.18181818181818,
    63.125,
    80.71428571428571,
    39.285714285714285,
    76,
    78,
    54.166666666666664,
    57.5,
    61.8421052631579,
    65.9090909090909,
    55.90909090909091,
    80.55555555555556,
    88.33333333333333,
    42.391304347826086,
    54.375,
    43.18181818181818,
    57.22222222222222,
    80,
    43.84615384615385,
    60,
    47,
    77.17391304347827,
    34.464285714285715,
    79.44444444444444,
    56,
    28.5,
    70.55555555555556,
    70.55555555555556,
    61.111111111111114,
    63.47826086956522,
    58.34782608695652,
    74.44444444444444,
    60.925925925925924,
    80,
    58.07692307692308
  ]
}