{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 12,
            "Incorrect cases": 17,
            "Average distance for correct cases": 1.1666666666666667,
            "Average distance for incorrect cases": 3.823529411764706,
            "Overall average distance": 2.7241379310344827,
            "Normalized average distance for correct cases": 0.03150448585231194,
            "Normalized average distance for incorrect cases": 0.11446324143692564,
            "Normalized overall average distance": 0.08013548050536136,
            "Correct step number predictions": 14,
            "Incorrect step number predictions": 15,
            "Step number accuracy": 0.4482758620689655,
            "Step accuracy within +-1": 0.5172413793103449,
            "Step accuracy within +-2": 0.5862068965517241,
            "Step accuracy within +-3": 0.6896551724137931,
            "Step accuracy within +-4": 0.7241379310344828,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 327354,
            "total_output_tokens": 74121,
            "total_tokens": 401475,
            "total_execution_time_sec": 631.0865
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 11,
            "Incorrect cases": 18,
            "Average distance for correct cases": 2.6,
            "Average distance for incorrect cases": 3.5789473684210527,
            "Overall average distance": 3.2413793103448274,
            "Normalized average distance for correct cases": 0.04530154277699859,
            "Normalized average distance for incorrect cases": 0.10201926036413571,
            "Normalized overall average distance": 0.08246142671339876,
            "Correct step number predictions": 14,
            "Incorrect step number predictions": 15,
            "Step number accuracy": 0.4827586206896552,
            "Step accuracy within +-1": 0.5172413793103449,
            "Step accuracy within +-2": 0.5862068965517241,
            "Step accuracy within +-3": 0.6551724137931034,
            "Step accuracy within +-4": 0.7241379310344828,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 327354,
            "total_output_tokens": 74872,
            "total_tokens": 402226,
            "total_execution_time_sec": 579.3478
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.8,
            "Average distance for incorrect cases": 3.4210526315789473,
            "Overall average distance": 2.5172413793103448,
            "Normalized average distance for correct cases": 0.017805383022774325,
            "Normalized average distance for incorrect cases": 0.09895730683778589,
            "Normalized overall average distance": 0.0709738848326095,
            "Correct step number predictions": 13,
            "Incorrect step number predictions": 16,
            "Step number accuracy": 0.4482758620689655,
            "Step accuracy within +-1": 0.5172413793103449,
            "Step accuracy within +-2": 0.5862068965517241,
            "Step accuracy within +-3": 0.6896551724137931,
            "Step accuracy within +-4": 0.7931034482758621,
            "Step accuracy within +-5": 0.8620689655172413,
            "total_prompt_tokens": 327354,
            "total_output_tokens": 69736,
            "total_tokens": 397090,
            "total_execution_time_sec": 532.8668
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 33,
        "overall_incorrect_cases": 54,
        "overall_total_cases": 87,
        "overall_accuracy": 0.3793103448275862,
        "overall_avg_distance_for_correct_cases": 1.5,
        "overall_avg_distance_for_incorrect_cases": 3.6,
        "overall_avg_distance": 2.8275862068965516,
        "overall_normalized_avg_distance_for_correct_cases": 0.031535096507046015,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.10480781602298629,
        "overall_normalized_avg_distance": 0.07785693068378988,
        "overall_correct_step_number_predictions": 41,
        "overall_incorrect_step_number_predictions": 46,
        "overall_step_number_accuracy": 0.47126436781609195,
        "overall_step_accuracy_within_+-1": 0.5172413793103449,
        "overall_step_accuracy_within_+-2": 0.5862068965517241,
        "overall_step_accuracy_within_+-3": 0.67816091954023,
        "overall_step_accuracy_within_+-4": 0.7471264367816092,
        "overall_step_accuracy_within_+-5": 0.8160919540229885,
        "grand_total_prompt_tokens": 982062,
        "grand_total_output_tokens": 218729,
        "grand_total_tokens": 1200791,
        "grand_total_execution_time_sec": 1743.3011,
        "avg_prompt_tokens_per_run": 327354.0,
        "avg_output_tokens_per_run": 72909.66666666667,
        "avg_tokens_per_run": 400263.6666666667,
        "avg_execution_time_per_run_sec": 581.1004
    },
    "stability_metrics": {
        "accuracy": {
            "mean":  0.3793103448275862,
            "std_dev": 0.0398172599441121,
            "variance": 0.0015854141894569944,
            "min": 0.3448275862068966,
            "max": 0.41379310344827586,
            "range": 0.06896551724137928,
            "coefficient_of_variation": 0.10825317547305478
        },
        "correct_cases": {
            "mean": 10.666666666666666,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 10,
            "max": 12,
            "range": 2
        },
        "incorrect_cases": {
            "mean": 18.333333333333332,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 17,
            "max": 19,
            "range": 2
        },
        "avg_distance_for_correct_cases": {
            "mean": 1.5222222222222224,
            "std_dev": 0.9512175433699774,
            "variance": 0.9048148148148148,
            "min": 0.8,
            "max": 2.6,
            "range": 1.8
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 3.607843137254902,
            "std_dev": 0.20278834714930818,
            "variance": 0.04112311373954833,
            "min": 3.4210526315789473,
            "max": 3.823529411764706,
            "range": 0.4024767801857587
        },
        "overall_avg_distance": {
            "mean": 2.8275862068965516,
            "std_dev": 0.3729880629790333,
            "variance": 0.13912009512485132,
            "min": 2.5172413793103448,
            "max": 3.2413793103448274,
            "range": 0.7241379310344827
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.03153713721736162,
            "std_dev": 0.013748108956914229,
            "variance": 0.00018901049989118526,
            "min": 0.017805383022774325,
            "max": 0.04530154277699859,
            "range": 0.027496159754224266
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.10514660287961575,
            "std_dev": 0.008212411664976773,
            "variance": 6.744370535504657e-05,
            "min": 0.09895730683778589,
            "max": 0.11446324143692564,
            "range": 0.01550593459913975
        },
        "normalized_overall_avg_distance": {
            "mean": 0.07785693068378988,
            "std_dev": 0.006073281368647383,
            "variance": 3.688474658275943e-05,
            "min": 0.0709738848326095,
            "max": 0.08246142671339876,
            "range": 0.011487541880789265
        },
        "correct_step_number_predictions": {
            "mean": 13.333333333333334,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 13,
            "max": 14,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 15.666666666666666,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 15,
            "max": 16,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.47126436781609195,
            "std_dev": 0.019908629972056068,
            "variance": 0.0003963535473642492,
            "min": 0.4482758620689655,
            "max": 0.4827586206896552,
            "range": 0.03448275862068967
        },
        "step_accuracy_within_+-1": {
            "mean": 0.5172413793103449,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.5172413793103449,
            "max": 0.5172413793103449,
            "range": 0.0
        },
        "step_accuracy_within_+-2": {
            "mean": 0.5862068965517241,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.5862068965517241,
            "max": 0.5862068965517241,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 0.67816091954023,
            "std_dev": 0.0199086299720561,
            "variance": 0.0003963535473642505,
            "min": 0.6551724137931034,
            "max": 0.6896551724137931,
            "range": 0.034482758620689724
        },
        "step_accuracy_within_+-4": {
            "mean": 0.7471264367816092,
            "std_dev": 0.039817259944112136,
            "variance": 0.0015854141894569968,
            "min": 0.7241379310344828,
            "max": 0.7931034482758621,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-5": {
            "mean": 0.8160919540229885,
            "std_dev": 0.039817259944112074,
            "variance": 0.0015854141894569918,
            "min": 0.7931034482758621,
            "max": 0.8620689655172413,
            "range": 0.06896551724137923
        },
        "total_prompt_tokens": {
            "mean": 327354,
            "std_dev": 0.0,
            "variance": 0,
            "min": 327354,
            "max": 327354,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 72909.66666666667,
            "std_dev": 2774.0079908560706,
            "variance": 7695120.333333333,
            "min": 69736,
            "max": 74872,
            "range": 5136
        },
        "total_tokens": {
            "mean": 400263.6666666667,
            "std_dev": 2774.0079908560706,
            "variance": 7695120.333333333,
            "min": 397090,
            "max": 402226,
            "range": 5136
        },
        "total_execution_time_sec": {
            "mean": 581.1003666666667,
            "std_dev": 49.13329812320086,
            "variance": 2414.0809844633327,
            "min": 532.8668,
            "max": 631.0865,
            "range": 98.21969999999999
        }
    }
}