{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 24,
            "Incorrect cases": 18,
            "Average distance for correct cases": 0.375,
            "Average distance for incorrect cases": 0.05555555555555555,
            "Overall average distance": 0.23809523809523808,
            "Normalized average distance for correct cases": 0.010154351821018487,
            "Normalized average distance for incorrect cases": 0.0030864197530864196,
            "Normalized overall average distance": 0.00712523807761903,
            "Correct step number predictions": 34,
            "Incorrect step number predictions": 8,
            "Step number accuracy": 0.8095238095238095,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 605370,
            "total_output_tokens": 85704,
            "total_tokens": 691074,
            "total_execution_time_sec": 955.1329
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 19,
            "Incorrect cases": 23,
            "Average distance for correct cases": 0.42105263157894735,
            "Average distance for incorrect cases": 0.08695652173913043,
            "Overall average distance": 0.23809523809523808,
            "Normalized average distance for correct cases": 0.010217462849041797,
            "Normalized average distance for incorrect cases": 0.0038647342995169077,
            "Normalized overall average distance": 0.006738587690968643,
            "Correct step number predictions": 35,
            "Incorrect step number predictions": 7,
            "Step number accuracy": 0.8333333333333334,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 605370,
            "total_output_tokens": 93438,
            "total_tokens": 698808,
            "total_execution_time_sec": 948.4022
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 25,
            "Incorrect cases": 17,
            "Average distance for correct cases": 0.4,
            "Average distance for incorrect cases": 0.058823529411764705,
            "Overall average distance": 0.2619047619047619,
            "Normalized average distance for correct cases": 0.010637066637066636,
            "Normalized average distance for incorrect cases": 0.00326797385620915,
            "Normalized overall average distance": 0.007654338606719559,
            "Correct step number predictions": 34,
            "Incorrect step number predictions": 8,
            "Step number accuracy": 0.8095238095238095,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 605370,
            "total_output_tokens": 89555,
            "total_tokens": 694925,
            "total_execution_time_sec": 919.2666
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 68,
        "overall_incorrect_cases": 58,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5396825396825397,
        "overall_avg_distance_for_correct_cases": 0.39705882352941174,
        "overall_avg_distance_for_incorrect_cases": 0.06896551724137931,
        "overall_avg_distance": 0.24603174603174602,
        "overall_normalized_avg_distance_for_correct_cases": 0.010349454467101525,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.003448275862068965,
        "overall_normalized_avg_distance": 0.007172721458435743,
        "overall_correct_step_number_predictions": 103,
        "overall_incorrect_step_number_predictions": 23,
        "overall_step_number_accuracy": 0.8174603174603174,
        "overall_step_accuracy_within_+-1": 0.9365079365079365,
        "overall_step_accuracy_within_+-2": 1.0,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1816110,
        "grand_total_output_tokens": 268697,
        "grand_total_tokens": 2084807,
        "grand_total_execution_time_sec": 2822.8017,
        "avg_prompt_tokens_per_run": 605370.0,
        "avg_output_tokens_per_run": 89565.66666666667,
        "avg_tokens_per_run": 694935.6666666666,
        "avg_execution_time_per_run_sec": 940.9339
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5396825396825397,
            "std_dev": 0.07653691080153138,
            "variance": 0.0058578987150415705,
            "min": 0.4523809523809524,
            "max": 0.5952380952380952,
            "range": 0.14285714285714285,
            "coefficient_of_variation": 0.14181839354401404
        },
        "correct_cases": {
            "mean": 22.666666666666668,
            "std_dev": 3.2145502536643185,
            "variance": 10.333333333333334,
            "min": 19,
            "max": 25,
            "range": 6
        },
        "incorrect_cases": {
            "mean": 19.333333333333332,
            "std_dev": 3.2145502536643185,
            "variance": 10.333333333333334,
            "min": 17,
            "max": 23,
            "range": 6
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.3986842105263158,
            "std_dev": 0.023054494036756874,
            "variance": 0.0005315096952908582,
            "min": 0.375,
            "max": 0.42105263157894735,
            "range": 0.046052631578947345
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.06711186890215023,
            "std_dev": 0.017263475836181322,
            "variance": 0.0002980275979464164,
            "min": 0.05555555555555555,
            "max": 0.08695652173913043,
            "range": 0.03140096618357488
        },
        "overall_avg_distance": {
            "mean": 0.24603174603174602,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.23809523809523808,
            "max": 0.2619047619047619,
            "range": 0.023809523809523836
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.010336293769042307,
            "std_dev": 0.000262381380949993,
            "variance": 6.884398906922535e-08,
            "min": 0.010154351821018487,
            "max": 0.010637066637066636,
            "range": 0.0004827148160481496
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.0034063759696041592,
            "std_dev": 0.000407197423911888,
            "variance": 1.6580974204047782e-07,
            "min": 0.0030864197530864196,
            "max": 0.0038647342995169077,
            "range": 0.0007783145464304881
        },
        "normalized_overall_avg_distance": {
            "mean": 0.007172721458435744,
            "std_dev": 0.00045971832518945676,
            "variance": 2.1134093851499913e-07,
            "min": 0.006738587690968643,
            "max": 0.007654338606719559,
            "range": 0.000915750915750916
        },
        "correct_step_number_predictions": {
            "mean": 34.333333333333336,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 34,
            "max": 35,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 7.666666666666667,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 7,
            "max": 8,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.8174603174603174,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.8095238095238095,
            "max": 0.8333333333333334,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9365079365079365,
            "std_dev": 0.013746434980705326,
            "variance": 0.00018896447467875904,
            "min": 0.9285714285714286,
            "max": 0.9523809523809523,
            "range": 0.023809523809523725
        },
        "step_accuracy_within_+-2": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 605370,
            "std_dev": 0.0,
            "variance": 0,
            "min": 605370,
            "max": 605370,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 89565.66666666667,
            "std_dev": 3867.0110335158515,
            "variance": 14953774.333333334,
            "min": 85704,
            "max": 93438,
            "range": 7734
        },
        "total_tokens": {
            "mean": 694935.6666666666,
            "std_dev": 3867.0110335158515,
            "variance": 14953774.333333334,
            "min": 691074,
            "max": 698808,
            "range": 7734
        },
        "total_execution_time_sec": {
            "mean": 940.9339,
            "std_dev": 19.06382693978306,
            "variance": 363.4294975899984,
            "min": 919.2666,
            "max": 955.1329,
            "range": 35.86629999999991
        }
    }
}