{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 24,
            "Incorrect cases": 18,
            "Average distance for correct cases": 0.4583333333333333,
            "Average distance for incorrect cases": 0.5555555555555556,
            "Overall average distance": 0.5,
            "Normalized average distance for correct cases": 0.014012376512376511,
            "Normalized average distance for incorrect cases": 0.021781245851586348,
            "Normalized overall average distance": 0.017341891943466442,
            "Correct step number predictions": 26,
            "Incorrect step number predictions": 16,
            "Step number accuracy": 0.6190476190476191,
            "Step accuracy within +-1": 0.9047619047619048,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 638520,
            "total_output_tokens": 70993,
            "total_tokens": 709513,
            "total_execution_time_sec": 680.474
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 26,
            "Incorrect cases": 16,
            "Average distance for correct cases": 0.38461538461538464,
            "Average distance for incorrect cases": 0.375,
            "Overall average distance": 0.38095238095238093,
            "Normalized average distance for correct cases": 0.011613021949560412,
            "Normalized average distance for incorrect cases": 0.015674603174603174,
            "Normalized overall average distance": 0.013160290987671939,
            "Correct step number predictions": 30,
            "Incorrect step number predictions": 12,
            "Step number accuracy": 0.7142857142857143,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 638520,
            "total_output_tokens": 66737,
            "total_tokens": 705257,
            "total_execution_time_sec": 805.7928
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 25,
            "Incorrect cases": 17,
            "Average distance for correct cases": 0.36,
            "Average distance for incorrect cases": 0.4117647058823529,
            "Overall average distance": 0.38095238095238093,
            "Normalized average distance for correct cases": 0.011146325896325898,
            "Normalized average distance for incorrect cases": 0.01721132897603486,
            "Normalized overall average distance": 0.013601208095255715,
            "Correct step number predictions": 30,
            "Incorrect step number predictions": 12,
            "Step number accuracy": 0.7142857142857143,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 638520,
            "total_output_tokens": 73813,
            "total_tokens": 712333,
            "total_execution_time_sec": 698.7396
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 75,
        "overall_incorrect_cases": 51,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5952380952380952,
        "overall_avg_distance_for_correct_cases": 0.4,
        "overall_avg_distance_for_incorrect_cases": 0.45098039215686275,
        "overall_avg_distance": 0.42063492063492064,
        "overall_normalized_avg_distance_for_correct_cases": 0.012225250058583392,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.01834213075911368,
        "overall_normalized_avg_distance": 0.014701130342131366,
        "overall_correct_step_number_predictions": 86,
        "overall_incorrect_step_number_predictions": 40,
        "overall_step_number_accuracy": 0.6825396825396826,
        "overall_step_accuracy_within_+-1": 0.9206349206349207,
        "overall_step_accuracy_within_+-2": 0.9761904761904762,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1915560,
        "grand_total_output_tokens": 211543,
        "grand_total_tokens": 2127103,
        "grand_total_execution_time_sec": 2185.0064,
        "avg_prompt_tokens_per_run": 638520.0,
        "avg_output_tokens_per_run": 70514.33333333333,
        "avg_tokens_per_run": 709034.3333333334,
        "avg_execution_time_per_run_sec": 728.3355
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5952380952380952,
            "std_dev": 0.023809523809523836,
            "variance": 0.0005668934240362824,
            "min": 0.5714285714285714,
            "max": 0.6190476190476191,
            "range": 0.04761904761904767,
            "coefficient_of_variation": 0.04000000000000004
        },
        "correct_cases": {
            "mean": 25,
            "std_dev": 1.0,
            "variance": 1,
            "min": 24,
            "max": 26,
            "range": 2
        },
        "incorrect_cases": {
            "mean": 17,
            "std_dev": 1.0,
            "variance": 1,
            "min": 16,
            "max": 18,
            "range": 2
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.400982905982906,
            "std_dev": 0.0511691599295739,
            "variance": 0.0026182829278983116,
            "min": 0.36,
            "max": 0.4583333333333333,
            "range": 0.09833333333333333
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.4474400871459695,
            "std_dev": 0.0954181679855683,
            "variance": 0.009104626781722133,
            "min": 0.375,
            "max": 0.5555555555555556,
            "range": 0.18055555555555558
        },
        "overall_avg_distance": {
            "mean": 0.42063492063492064,
            "std_dev": 0.06873217490352689,
            "variance": 0.004724111866969012,
            "min": 0.38095238095238093,
            "max": 0.5,
            "range": 0.11904761904761907
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.012257241452754274,
            "std_dev": 0.0015377989496883312,
            "variance": 2.364825609662534e-06,
            "min": 0.011146325896325898,
            "max": 0.014012376512376511,
            "range": 0.0028660506160506135
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.018222392667408126,
            "std_dev": 0.003176390803584795,
            "variance": 1.008945853709806e-05,
            "min": 0.015674603174603174,
            "max": 0.021781245851586348,
            "range": 0.006106642676983173
        },
        "normalized_overall_avg_distance": {
            "mean": 0.014701130342131365,
            "std_dev": 0.0022975679207053617,
            "variance": 5.27881835025436e-06,
            "min": 0.013160290987671939,
            "max": 0.017341891943466442,
            "range": 0.004181600955794503
        },
        "correct_step_number_predictions": {
            "mean": 28.666666666666668,
            "std_dev": 2.309401076758503,
            "variance": 5.333333333333333,
            "min": 26,
            "max": 30,
            "range": 4
        },
        "incorrect_step_number_predictions": {
            "mean": 13.333333333333334,
            "std_dev": 2.309401076758503,
            "variance": 5.333333333333333,
            "min": 12,
            "max": 16,
            "range": 4
        },
        "step_number_accuracy": {
            "mean": 0.6825396825396826,
            "std_dev": 0.0549857399228215,
            "variance": 0.003023431594860166,
            "min": 0.6190476190476191,
            "max": 0.7142857142857143,
            "range": 0.09523809523809523
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9206349206349207,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.9047619047619048,
            "max": 0.9285714285714286,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-2": {
            "mean": 0.9761904761904762,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.9761904761904762,
            "max": 0.9761904761904762,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 638520,
            "std_dev": 0.0,
            "variance": 0,
            "min": 638520,
            "max": 638520,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 70514.33333333333,
            "std_dev": 3562.2023150480004,
            "variance": 12689285.333333334,
            "min": 66737,
            "max": 73813,
            "range": 7076
        },
        "total_tokens": {
            "mean": 709034.3333333334,
            "std_dev": 3562.2023150480004,
            "variance": 12689285.333333334,
            "min": 705257,
            "max": 712333,
            "range": 7076
        },
        "total_execution_time_sec": {
            "mean": 728.3354666666667,
            "std_dev": 67.69886927544164,
            "variance": 4583.136901173336,
            "min": 680.474,
            "max": 805.7928,
            "range": 125.31880000000001
        }
    }
}