{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.8888888888888888,
            "Average distance for incorrect cases": 5.45,
            "Overall average distance": 4.0344827586206895,
            "Normalized average distance for correct cases": 0.02242926155969634,
            "Normalized average distance for incorrect cases": 0.160804304980869,
            "Normalized overall average distance": 0.11786032598809128,
            "Correct step number predictions": 11,
            "Incorrect step number predictions": 18,
            "Step number accuracy": 0.3793103448275862,
            "Step accuracy within +-1": 0.4482758620689655,
            "Step accuracy within +-2": 0.5172413793103449,
            "Step accuracy within +-3": 0.5862068965517241,
            "Step accuracy within +-4": 0.7241379310344828,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 454135,
            "total_output_tokens": 30515,
            "total_tokens": 484650,
            "total_execution_time_sec": 1173.2132
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.8888888888888888,
            "Average distance for incorrect cases": 5.55,
            "Overall average distance": 4.103448275862069,
            "Normalized average distance for correct cases": 0.02242926155969634,
            "Normalized average distance for incorrect cases": 0.1655674146781892,
            "Normalized overall average distance": 0.12114522922762246,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 19,
            "Step number accuracy": 0.3448275862068966,
            "Step accuracy within +-1": 0.3793103448275862,
            "Step accuracy within +-2": 0.4827586206896552,
            "Step accuracy within +-3": 0.5517241379310345,
            "Step accuracy within +-4": 0.7241379310344828,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 454135,
            "total_output_tokens": 29741,
            "total_tokens": 483876,
            "total_execution_time_sec": 935.8219
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 1.5555555555555556,
            "Average distance for incorrect cases": 5.5,
            "Overall average distance": 4.275862068965517,
            "Normalized average distance for correct cases": 0.04134460547504026,
            "Normalized average distance for incorrect cases": 0.174965308451083,
            "Normalized overall average distance": 0.13349681442403527,
            "Correct step number predictions": 11,
            "Incorrect step number predictions": 18,
            "Step number accuracy": 0.3793103448275862,
            "Step accuracy within +-1": 0.3793103448275862,
            "Step accuracy within +-2": 0.5172413793103449,
            "Step accuracy within +-3": 0.5862068965517241,
            "Step accuracy within +-4": 0.7241379310344828,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 454135,
            "total_output_tokens": 26353,
            "total_tokens": 480488,
            "total_execution_time_sec": 923.4649
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 30,
        "overall_incorrect_cases": 57,
        "overall_total_cases": 87,
        "overall_accuracy": 0.3448275862068966,
        "overall_avg_distance_for_correct_cases": 1.1111111111111112,
        "overall_avg_distance_for_incorrect_cases": 5.5,
        "overall_avg_distance": 4.137931034482759,
        "overall_normalized_avg_distance_for_correct_cases": 0.028734376198144312,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.16711234270338038,
        "overall_normalized_avg_distance": 0.124167456546583,
        "overall_correct_step_number_predictions": 32,
        "overall_incorrect_step_number_predictions": 55,
        "overall_step_number_accuracy": 0.367816091954023,
        "overall_step_accuracy_within_+-1": 0.367816091954023,
        "overall_step_accuracy_within_+-2": 0.5057471264367817,
        "overall_step_accuracy_within_+-3": 0.5747126436781609,
        "overall_step_accuracy_within_+-4": 0.7241379310344828,
        "overall_step_accuracy_within_+-5": 0.7931034482758621,
        "grand_total_prompt_tokens": 1362405,
        "grand_total_output_tokens": 86609,
        "grand_total_tokens": 1449014,
        "grand_total_execution_time_sec": 3032.5,
        "avg_prompt_tokens_per_run": 454135.0,
        "avg_output_tokens_per_run": 28869.666666666668,
        "avg_tokens_per_run": 483004.6666666667,
        "avg_execution_time_per_run_sec": 1010.8333
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.3448275862068966,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.3448275862068966,
            "max": 0.3448275862068966,
            "range": 0.0,
            "coefficient_of_variation": 0.0
        },
        "correct_cases": {
            "mean": 10,
            "std_dev": 0.0,
            "variance": 0,
            "min": 10,
            "max": 10,
            "range": 0
        },
        "incorrect_cases": {
            "mean": 19,
            "std_dev": 0.0,
            "variance": 0,
            "min": 19,
            "max": 19,
            "range": 0
        },
        "avg_distance_for_correct_cases": {
            "mean": 1.1111111111111112,
            "std_dev": 0.3849001794597506,
            "variance": 0.14814814814814817,
            "min": 0.8888888888888888,
            "max": 1.5555555555555556,
            "range": 0.6666666666666667
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 5.5,
            "std_dev": 0.04999999999999982,
            "variance": 0.0024999999999999823,
            "min": 5.45,
            "max": 5.55,
            "range": 0.09999999999999964
        },
        "overall_avg_distance": {
            "mean": 4.137931034482759,
            "std_dev": 0.12432935432634427,
            "variance": 0.015457788347205662,
            "min": 4.0344827586206895,
            "max": 4.275862068965517,
            "range": 0.2413793103448274
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.028734376198144312,
            "std_dev": 0.01092077890133816,
            "variance": 0.00011926341181191271,
            "min": 0.02242926155969634,
            "max": 0.04134460547504026,
            "range": 0.018915343915343916
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.1671123427033804,
            "std_dev": 0.007205803686829042,
            "variance": 5.192360677311902e-05,
            "min": 0.160804304980869,
            "max": 0.174965308451083,
            "range": 0.014161003470213984
        },
        "normalized_overall_avg_distance": {
            "mean": 0.124167456546583,
            "std_dev": 0.008244715648671951,
            "variance": 6.797533612745614e-05,
            "min": 0.11786032598809128,
            "max": 0.13349681442403527,
            "range": 0.015636488435943993
        },
        "correct_step_number_predictions": {
            "mean": 11,
            "std_dev": 1.0,
            "variance": 1.0,
            "min": 10,
            "max": 11,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 18,
            "std_dev": 1.0,
            "variance": 1,
            "min": 18,
            "max": 19,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.367816091954023,
            "std_dev": 0.016255328303139,
            "variance": 0.00026423569824283,
            "min": 0.3448275862068966,
            "max": 0.3793103448275862,
            "range": 0.03448275862068961
        },
        "step_accuracy_within_+-1": {
            "mean": 0.4022988505747106,
            "std_dev": 0.032510656606278,
            "variance": 0.0010568888888888888,
            "min": 0.3793103448275862,
            "max": 0.4482758620689655,
            "range": 0.06896551724137928
        },
        "step_accuracy_within_+-2": {
            "mean": 0.5057471264367817,
            "std_dev": 0.019908629972056068,
            "variance": 0.0003963535473642492,
            "min": 0.4827586206896552,
            "max": 0.5172413793103449,
            "range": 0.03448275862068967
        },
        "step_accuracy_within_+-3": {
            "mean": 0.5747126436781609,
            "std_dev": 0.019908629972056037,
            "variance": 0.00039635354736424795,
            "min": 0.5517241379310345,
            "max": 0.5862068965517241,
            "range": 0.03448275862068961
        },
        "step_accuracy_within_+-4": {
            "mean": 0.7241379310344828,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.7241379310344828,
            "max": 0.7241379310344828,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 0.7931034482758621,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.7931034482758621,
            "max": 0.7931034482758621,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 454135,
            "std_dev": 0.0,
            "variance": 0,
            "min": 454135,
            "max": 454135,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 28869.666666666668,
            "std_dev": 2213.589242233828,
            "variance": 4899977.333333333,
            "min": 26353,
            "max": 30515,
            "range": 4162
        },
        "total_tokens": {
            "mean": 483004.6666666667,
            "std_dev": 2213.589242233828,
            "variance": 4899977.333333333,
            "min": 480488,
            "max": 484650,
            "range": 4162
        },
        "total_execution_time_sec": {
            "mean": 1010.8333333333333,
            "std_dev": 140.76075335924898,
            "variance": 19813.589686263324,
            "min": 923.4649,
            "max": 1173.2132,
            "range": 249.74829999999997
        }
    }
}