{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 7,
            "Incorrect cases": 22,
            "Average distance for correct cases": 0.5714285714285714,
            "Average distance for incorrect cases": 5.7272727272727275,
            "Overall average distance": 4.482758620689655,
            "Normalized average distance for correct cases": 0.019047619047619046,
            "Normalized average distance for incorrect cases": 0.1636700791034518,
            "Normalized overall average distance": 0.12876120943480251,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 19,
            "Step number accuracy": 0.3448275862068966,
            "Step accuracy within +-1": 0.3793103448275862,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.5172413793103449,
            "Step accuracy within +-4": 0.6206896551724138,
            "Step accuracy within +-5": 0.6896551724137931,
            "total_prompt_tokens": 276645,
            "total_output_tokens": 98893,
            "total_tokens": 375538,
            "total_execution_time_sec": 1025.8826
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 7,
            "Incorrect cases": 22,
            "Average distance for correct cases": 1.1428571428571428,
            "Average distance for incorrect cases": 8.0,
            "Overall average distance": 6.344827586206897,
            "Normalized average distance for correct cases": 0.03537414965986395,
            "Normalized average distance for incorrect cases": 0.2185879081246897,
            "Normalized overall average distance": 0.17436389746076625,
            "Correct step number predictions": 8,
            "Incorrect step number predictions": 21,
            "Step number accuracy": 0.27586206896551724,
            "Step accuracy within +-1": 0.3448275862068966,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.4827586206896552,
            "Step accuracy within +-4": 0.5862068965517241,
            "Step accuracy within +-5": 0.6551724137931034,
            "total_prompt_tokens": 276645,
            "total_output_tokens": 103133,
            "total_tokens": 379778,
            "total_execution_time_sec": 1040.1155
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 8,
            "Incorrect cases": 21,
            "Average distance for correct cases": 0.6666666666666666,
            "Average distance for incorrect cases": 8.65,
            "Overall average distance": 6.172413793103448,
            "Normalized average distance for correct cases": 0.022222222222222223,
            "Normalized average distance for incorrect cases": 0.23618945256034704,
            "Normalized overall average distance": 0.16978582935196349,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 19,
            "Step number accuracy": 0.3448275862068966,
            "Step accuracy within +-1": 0.3793103448275862,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.5172413793103449,
            "Step accuracy within +-4": 0.5862068965517241,
            "Step accuracy within +-5": 0.6551724137931034,
            "total_prompt_tokens": 276645,
            "total_output_tokens": 97886,
            "total_tokens": 374531,
            "total_execution_time_sec": 1037.3391
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 22,
        "overall_incorrect_cases": 65,
        "overall_total_cases": 87,
        "overall_accuracy": 0.2528735632183908,
        "overall_avg_distance_for_correct_cases": 0.782608695652174,
        "overall_avg_distance_for_incorrect_cases": 7.421875,
        "overall_avg_distance": 5.666666666666667,
        "overall_normalized_avg_distance_for_correct_cases": 0.025258799171842653,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.2052103870347821,
        "overall_normalized_avg_distance": 0.1576369787491774,
        "overall_correct_step_number_predictions": 28,
        "overall_incorrect_step_number_predictions": 59,
        "overall_step_number_accuracy": 0.3218390804597701,
        "overall_step_accuracy_within_+-1": 0.367816091954023,
        "overall_step_accuracy_within_+-2": 0.4482758620689655,
        "overall_step_accuracy_within_+-3": 0.5057471264367817,
        "overall_step_accuracy_within_+-4": 0.5977011494252873,
        "overall_step_accuracy_within_+-5": 0.6666666666666666,
        "grand_total_prompt_tokens": 829935,
        "grand_total_output_tokens": 299912,
        "grand_total_tokens": 1129847,
        "grand_total_execution_time_sec": 3103.3372,
        "avg_prompt_tokens_per_run": 276645.0,
        "avg_output_tokens_per_run": 99970.66666666667,
        "avg_tokens_per_run": 376615.6666666667,
        "avg_execution_time_per_run_sec": 1034.4457
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.26436781609195403,
            "std_dev": 0.03981725994411212,
            "variance": 0.0015854141894569955,
            "min": 0.2413793103448276,
            "max": 0.3103448275862069,
            "range": 0.06896551724137931,
            "coefficient_of_variation": 0.1506131137016415
        },
        "correct_cases": {
            "mean": 7.666666666666667,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 7,
            "max": 9,
            "range": 2
        },
        "incorrect_cases": {
            "mean": 21.333333333333332,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 20,
            "max": 22,
            "range": 2
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.7936507936507936,
            "std_dev": 0.30614764320612553,
            "variance": 0.09372637944066514,
            "min": 0.5714285714285714,
            "max": 1.1428571428571428,
            "range": 0.5714285714285714
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 7.459090909090909,
            "std_dev": 1.5346076570762395,
            "variance": 2.355020661157025,
            "min": 5.7272727272727275,
            "max": 8.65,
            "range": 2.922727272727273
        },
        "overall_avg_distance": {
            "mean": 5.666666666666667,
            "std_dev": 1.0289122047142358,
            "variance": 1.0586603250099096,
            "min": 4.482758620689655,
            "max": 6.344827586206897,
            "range": 1.862068965517242
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.025547996976568406,
            "std_dev": 0.00865647062870471,
            "variance": 7.49344837456273e-05,
            "min": 0.019047619047619046,
            "max": 0.03537414965986395,
            "range": 0.016326530612244903
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.20614914659616285,
            "std_dev": 0.037826009212152296,
            "variance": 0.0014308069729178302,
            "min": 0.1636700791034518,
            "max": 0.23618945256034704,
            "range": 0.07251937345689524
        },
        "normalized_overall_avg_distance": {
            "mean": 0.15763697874917743,
            "std_dev": 0.02511169482581683,
            "variance": 0.0006305972170249556,
            "min": 0.12876120943480251,
            "max": 0.17436389746076625,
            "range": 0.045602688025963733
        },
        "correct_step_number_predictions": {
            "mean": 9.333333333333334,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 8,
            "max": 10,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 19.666666666666668,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 19,
            "max": 21,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.3218390804597701,
            "std_dev": 0.039817259944112136,
            "variance": 0.0015854141894569968,
            "min": 0.27586206896551724,
            "max": 0.3448275862068966,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-1": {
            "mean": 0.367816091954023,
            "std_dev": 0.019908629972056037,
            "variance": 0.00039635354736424795,
            "min": 0.3448275862068966,
            "max": 0.3793103448275862,
            "range": 0.03448275862068961
        },
        "step_accuracy_within_+-2": {
            "mean": 0.4482758620689655,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.4482758620689655,
            "max": 0.4482758620689655,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 0.5057471264367817,
            "std_dev": 0.019908629972056068,
            "variance": 0.0003963535473642492,
            "min": 0.4827586206896552,
            "max": 0.5172413793103449,
            "range": 0.03448275862068967
        },
        "step_accuracy_within_+-4": {
            "mean": 0.5977011494252873,
            "std_dev": 0.0199086299720561,
            "variance": 0.0003963535473642505,
            "min": 0.5862068965517241,
            "max": 0.6206896551724138,
            "range": 0.034482758620689724
        },
        "step_accuracy_within_+-5": {
            "mean": 0.6666666666666666,
            "std_dev": 0.0199086299720561,
            "variance": 0.0003963535473642505,
            "min": 0.6551724137931034,
            "max": 0.6896551724137931,
            "range": 0.034482758620689724
        },
        "total_prompt_tokens": {
            "mean": 276645,
            "std_dev": 0.0,
            "variance": 0,
            "min": 276645,
            "max": 276645,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 99970.66666666667,
            "std_dev": 2784.560348301565,
            "variance": 7753776.333333333,
            "min": 97886,
            "max": 103133,
            "range": 5247
        },
        "total_tokens": {
            "mean": 376615.6666666667,
            "std_dev": 2784.560348301565,
            "variance": 7753776.333333333,
            "min": 374531,
            "max": 379778,
            "range": 5247
        },
        "total_execution_time_sec": {
            "mean": 1034.4457333333332,
            "std_dev": 7.544702684886561,
            "variance": 56.92253860333448,
            "min": 1025.8826,
            "max": 1040.1155,
            "range": 14.2329000000002
        }
    }
}