{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 12,
            "Incorrect cases": 17,
            "Average distance for correct cases": 2.5454545454545454,
            "Average distance for incorrect cases": 3.888888888888889,
            "Overall average distance": 3.3793103448275863,
            "Normalized average distance for correct cases": 0.04551222503536669,
            "Normalized average distance for incorrect cases": 0.11010972052638719,
            "Normalized overall average distance": 0.08560722223668976,
            "Correct step number predictions": 14,
            "Incorrect step number predictions": 15,
            "Step number accuracy": 0.4827586206896552,
            "Step accuracy within +-1": 0.4827586206896552,
            "Step accuracy within +-2": 0.5517241379310345,
            "Step accuracy within +-3": 0.6206896551724138,
            "Step accuracy within +-4": 0.6896551724137931,
            "Step accuracy within +-5": 0.7586206896551724,
            "total_prompt_tokens": 347567,
            "total_output_tokens": 75116,
            "total_tokens": 422683,
            "total_execution_time_sec": 619.9971
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 11,
            "Incorrect cases": 18,
            "Average distance for correct cases": 2.5454545454545454,
            "Average distance for incorrect cases": 3.5,
            "Overall average distance": 3.1379310344827585,
            "Normalized average distance for correct cases": 0.04551222503536669,
            "Normalized average distance for incorrect cases": 0.09994890181293689,
            "Normalized overall average distance": 0.07930050717316889,
            "Correct step number predictions": 14,
            "Incorrect step number predictions": 15,
            "Step number accuracy": 0.4827586206896552,
            "Step accuracy within +-1": 0.5172413793103449,
            "Step accuracy within +-2": 0.5862068965517241,
            "Step accuracy within +-3": 0.6551724137931034,
            "Step accuracy within +-4": 0.7241379310344828,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 347567,
            "total_output_tokens": 74062,
            "total_tokens": 421629,
            "total_execution_time_sec": 573.6203
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 11,
            "Incorrect cases": 18,
            "Average distance for correct cases": 1.7272727272727273,
            "Average distance for incorrect cases": 3.111111111111111,
            "Overall average distance": 2.586206896551724,
            "Normalized average distance for correct cases": 0.03647891963109354,
            "Normalized average distance for incorrect cases": 0.09323361823361824,
            "Normalized overall average distance": 0.07170597393610888,
            "Correct step number predictions": 14,
            "Incorrect step number predictions": 15,
            "Step number accuracy": 0.4827586206896552,
            "Step accuracy within +-1": 0.5172413793103449,
            "Step accuracy within +-2": 0.5862068965517241,
            "Step accuracy within +-3": 0.6896551724137931,
            "Step accuracy within +-4": 0.7586206896551724,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 347567,
            "total_output_tokens": 68378,
            "total_tokens": 415945,
            "total_execution_time_sec": 538.3278
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 34,
        "overall_incorrect_cases": 53,
        "overall_total_cases": 87,
        "overall_accuracy": 0.39080459770114943,
        "overall_avg_distance_for_correct_cases": 2.272727272727273,
        "overall_avg_distance_for_incorrect_cases": 3.5,
        "overall_avg_distance": 3.0344827586206895,
        "overall_normalized_avg_distance_for_correct_cases": 0.04250112323394231,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.1010974135243141,
        "overall_normalized_avg_distance": 0.07887123444865585,
        "overall_correct_step_number_predictions": 42,
        "overall_incorrect_step_number_predictions": 45,
        "overall_step_number_accuracy": 0.4827586206896552,
        "overall_step_accuracy_within_+-1": 0.5057471264367817,
        "overall_step_accuracy_within_+-2": 0.5747126436781609,
        "overall_step_accuracy_within_+-3": 0.6551724137931034,
        "overall_step_accuracy_within_+-4": 0.7241379310344828,
        "overall_step_accuracy_within_+-5": 0.7816091954022989,
        "grand_total_prompt_tokens": 1042701,
        "grand_total_output_tokens": 217556,
        "grand_total_tokens": 1260257,
        "grand_total_execution_time_sec": 1731.9452,
        "avg_prompt_tokens_per_run": 347567.0,
        "avg_output_tokens_per_run": 72518.66666666667,
        "avg_tokens_per_run": 420085.6666666667,
        "avg_execution_time_per_run_sec": 577.3151
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.39080459770114943,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.3793103448275862,
            "max": 0.3793103448275862,
            "range": 0.0,
            "coefficient_of_variation": 0.0
        },
        "correct_cases": {
            "mean": 11,
            "std_dev": 0.0,
            "variance": 0,
            "min": 11,
            "max": 11,
            "range": 0
        },
        "incorrect_cases": {
            "mean": 18,
            "std_dev": 0.0,
            "variance": 0,
            "min": 18,
            "max": 18,
            "range": 0
        },
        "avg_distance_for_correct_cases": {
            "mean": 2.2727272727272725,
            "std_dev": 0.47237749297333015,
            "variance": 0.22314049586776857,
            "min": 1.7272727272727273,
            "max": 2.5454545454545454,
            "range": 0.8181818181818181
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 3.5,
            "std_dev": 0.38888888888888884,
            "variance": 0.15123456790123452,
            "min": 3.111111111111111,
            "max": 3.888888888888889,
            "range": 0.7777777777777777
        },
        "overall_avg_distance": {
            "mean": 3.0344827586206895,
            "std_dev": 0.4065457283638481,
            "variance": 0.16527942925089179,
            "min": 2.586206896551724,
            "max": 3.3793103448275863,
            "range": 0.7931034482758621
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.042501123233942305,
            "std_dev": 0.005215381306829207,
            "variance": 2.720020217562353e-05,
            "min": 0.03647891963109354,
            "max": 0.04551222503536669,
            "range": 0.009033305404273155
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.1010974135243141,
            "std_dev": 0.008496470826900917,
            "variance": 7.219001651237834e-05,
            "min": 0.09323361823361824,
            "max": 0.11010972052638719,
            "range": 0.016876102292768952
        },
        "normalized_overall_avg_distance": {
            "mean": 0.07887123444865585,
            "std_dev": 0.006960559056757501,
            "variance": 4.844938238260887e-05,
            "min": 0.07170597393610888,
            "max": 0.08560722223668976,
            "range": 0.013901248300580885
        },
        "correct_step_number_predictions": {
            "mean": 13.666666666666666,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 13,
            "max": 14,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 15.333333333333334,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 15,
            "max": 16,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.4827586206896552,
            "std_dev": 0.019908629972056068,
            "variance": 0.0,
            "min": 0.4827586206896552,
            "max": 0.4827586206896552,
            "range": 0.0
        },
        "step_accuracy_within_+-1": {
            "mean": 0.5057471264367817,
            "std_dev": 0.019908629972056068,
            "variance": 0.0003963535473642492,
            "min": 0.4827586206896552,
            "max": 0.5172413793103449,
            "range": 0.03448275862068967
        },
        "step_accuracy_within_+-2": {
            "mean": 0.5747126436781609,
            "std_dev": 0.019908629972056037,
            "variance": 0.00039635354736424795,
            "min": 0.5517241379310345,
            "max": 0.5862068965517241,
            "range": 0.03448275862068961
        },
        "step_accuracy_within_+-3": {
            "mean": 0.6551724137931034,
            "std_dev": 0.03448275862068967,
            "variance": 0.0011890606420927477,
            "min": 0.6206896551724138,
            "max": 0.6896551724137931,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-4": {
            "mean": 0.7241379310344828,
            "std_dev": 0.03448275862068961,
            "variance": 0.0011890606420927438,
            "min": 0.6896551724137931,
            "max": 0.7586206896551724,
            "range": 0.06896551724137923
        },
        "step_accuracy_within_+-5": {
            "mean": 0.7816091954022989,
            "std_dev": 0.0199086299720561,
            "variance": 0.0003963535473642505,
            "min": 0.7586206896551724,
            "max": 0.7931034482758621,
            "range": 0.034482758620689724
        },
        "total_prompt_tokens": {
            "mean": 347567,
            "std_dev": 0.0,
            "variance": 0,
            "min": 347567,
            "max": 347567,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 72518.66666666667,
            "std_dev": 3624.4405545316004,
            "variance": 13136569.333333334,
            "min": 68378,
            "max": 75116,
            "range": 6738
        },
        "total_tokens": {
            "mean": 420085.6666666667,
            "std_dev": 3624.4405545316004,
            "variance": 13136569.333333334,
            "min": 415945,
            "max": 422683,
            "range": 6738
        },
        "total_execution_time_sec": {
            "mean": 577.3150666666667,
            "std_dev": 40.959823194971605,
            "variance": 1677.7071161633341,
            "min": 538.3278,
            "max": 619.9971,
            "range": 81.66930000000002
        }
    }
}