{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 7,
            "Incorrect cases": 22,
            "Average distance for correct cases": 1.1428571428571428,
            "Average distance for incorrect cases": 7.909090909090909,
            "Overall average distance": 6.275862068965517,
            "Normalized average distance for correct cases": 0.024844720496894408,
            "Normalized average distance for incorrect cases": 0.21196714521102059,
            "Normalized overall average distance": 0.1667996633834729,
            "Correct step number predictions": 9,
            "Incorrect step number predictions": 20,
            "Step number accuracy": 0.3103448275862069,
            "Step accuracy within +-1": 0.3793103448275862,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.4827586206896552,
            "Step accuracy within +-4": 0.5862068965517241,
            "Step accuracy within +-5": 0.6551724137931034,
            "total_prompt_tokens": 296858,
            "total_output_tokens": 90869,
            "total_tokens": 387727,
            "total_execution_time_sec": 924.1762
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 8,
            "Incorrect cases": 21,
            "Average distance for correct cases": 1.5,
            "Average distance for incorrect cases": 5.9523809523809526,
            "Overall average distance": 4.724137931034483,
            "Normalized average distance for correct cases": 0.033126293995859216,
            "Normalized average distance for incorrect cases": 0.17258990578718653,
            "Normalized overall average distance": 0.13411718529302724,
            "Correct step number predictions": 9,
            "Incorrect step number predictions": 20,
            "Step number accuracy": 0.3103448275862069,
            "Step accuracy within +-1": 0.3448275862068966,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.4827586206896552,
            "Step accuracy within +-4": 0.6206896551724138,
            "Step accuracy within +-5": 0.6896551724137931,
            "total_prompt_tokens": 296858,
            "total_output_tokens": 108189,
            "total_tokens": 405047,
            "total_execution_time_sec": 1158.5556
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 7,
            "Incorrect cases": 22,
            "Average distance for correct cases": 0.0,
            "Average distance for incorrect cases": 8.318181818181818,
            "Overall average distance": 6.310344827586207,
            "Normalized average distance for correct cases": 0.0,
            "Normalized average distance for incorrect cases": 0.22578575413832516,
            "Normalized overall average distance": 0.17128574451872944,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 19,
            "Step number accuracy": 0.3448275862068966,
            "Step accuracy within +-1": 0.3793103448275862,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.4827586206896552,
            "Step accuracy within +-4": 0.5862068965517241,
            "Step accuracy within +-5": 0.6551724137931034,
            "total_prompt_tokens": 296858,
            "total_output_tokens": 104058,
            "total_tokens": 400916,
            "total_execution_time_sec": 1100.1437
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 22,
        "overall_incorrect_cases": 65,
        "overall_total_cases": 87,
        "overall_accuracy": 0.25287356321839083,
        "overall_avg_distance_for_correct_cases": 0.9090909090909091,
        "overall_avg_distance_for_incorrect_cases": 7.415384615384616,
        "overall_avg_distance": 5.7701149425287355,
        "overall_normalized_avg_distance_for_correct_cases": 0.0199510634293243,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.20392233549563885,
        "overall_normalized_avg_distance": 0.1574008643984099,
        "overall_correct_step_number_predictions": 28,
        "overall_incorrect_step_number_predictions": 59,
        "overall_step_number_accuracy": 0.3218390804597701,
        "overall_step_accuracy_within_+-1": 0.367816091954023,
        "overall_step_accuracy_within_+-2": 0.4482758620689655,
        "overall_step_accuracy_within_+-3": 0.4827586206896552,
        "overall_step_accuracy_within_+-4": 0.5977011494252873,
        "overall_step_accuracy_within_+-5": 0.6666666666666666,
        "grand_total_prompt_tokens": 890574,
        "grand_total_output_tokens": 303116,
        "grand_total_tokens": 1193690,
        "grand_total_execution_time_sec": 3182.8755,
        "avg_prompt_tokens_per_run": 296858.0,
        "avg_output_tokens_per_run": 101038.66666666667,
        "avg_tokens_per_run": 397896.6666666667,
        "avg_execution_time_per_run_sec": 1060.9585
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.25287356321839083,
            "std_dev": 0.01990862997205605,
            "variance": 0.0003963535473642486,
            "min": 0.2413793103448276,
            "max": 0.27586206896551724,
            "range": 0.03448275862068964,
            "coefficient_of_variation": 0.07872958216222165
        },
        "correct_cases": {
            "mean": 7.333333333333333,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 7,
            "max": 8,
            "range": 1
        },
        "incorrect_cases": {
            "mean": 21.666666666666668,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 21,
            "max": 22,
            "range": 1
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.8809523809523809,
            "std_dev": 0.7835467939002064,
            "variance": 0.6139455782312925,
            "min": 0.0,
            "max": 1.5,
            "range": 1.5
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 7.393217893217893,
            "std_dev": 1.264455282141473,
            "variance": 1.5988471605354722,
            "min": 5.9523809523809526,
            "max": 8.318181818181818,
            "range": 2.365800865800866
        },
        "overall_avg_distance": {
            "mean": 5.7701149425287355,
            "std_dev": 0.9060067309856805,
            "variance": 0.8208481965913592,
            "min": 4.724137931034483,
            "max": 6.310344827586207,
            "range": 1.5862068965517242
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.01932367149758454,
            "std_dev": 0.017239469974874806,
            "variance": 0.00029719932501460995,
            "min": 0.0,
            "max": 0.033126293995859216,
            "range": 0.033126293995859216
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.20344760171217743,
            "std_dev": 0.027602292233089,
            "variance": 0.0007618865365208454,
            "min": 0.17258990578718653,
            "max": 0.22578575413832516,
            "range": 0.053195848351138636
        },
        "normalized_overall_avg_distance": {
            "mean": 0.15740086439840986,
            "std_dev": 0.020288630202649205,
            "variance": 0.00041162851549984955,
            "min": 0.13411718529302724,
            "max": 0.17128574451872944,
            "range": 0.0371685592257022
        },
        "correct_step_number_predictions": {
            "mean": 9.333333333333334,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 9,
            "max": 10,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 19.666666666666668,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 19,
            "max": 20,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.3218390804597701,
            "std_dev": 0.019908629972056068,
            "variance": 0.0003963535473642492,
            "min": 0.3103448275862069,
            "max": 0.3448275862068966,
            "range": 0.03448275862068967
        },
        "step_accuracy_within_+-1": {
            "mean": 0.367816091954023,
            "std_dev": 0.019908629972056037,
            "variance": 0.00039635354736424795,
            "min": 0.3448275862068966,
            "max": 0.3793103448275862,
            "range": 0.03448275862068961
        },
        "step_accuracy_within_+-2": {
            "mean": 0.4482758620689655,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.4482758620689655,
            "max": 0.4482758620689655,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 0.4827586206896552,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.4827586206896552,
            "max": 0.4827586206896552,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 0.5977011494252873,
            "std_dev": 0.0199086299720561,
            "variance": 0.0003963535473642505,
            "min": 0.5862068965517241,
            "max": 0.6206896551724138,
            "range": 0.034482758620689724
        },
        "step_accuracy_within_+-5": {
            "mean": 0.6666666666666666,
            "std_dev": 0.0199086299720561,
            "variance": 0.0003963535473642505,
            "min": 0.6551724137931034,
            "max": 0.6896551724137931,
            "range": 0.034482758620689724
        },
        "total_prompt_tokens": {
            "mean": 296858,
            "std_dev": 0.0,
            "variance": 0,
            "min": 296858,
            "max": 296858,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 101038.66666666667,
            "std_dev": 9046.152791841034,
            "variance": 81832880.33333333,
            "min": 90869,
            "max": 108189,
            "range": 17320
        },
        "total_tokens": {
            "mean": 397896.6666666667,
            "std_dev": 9046.152791841034,
            "variance": 81832880.33333333,
            "min": 387727,
            "max": 405047,
            "range": 17320
        },
        "total_execution_time_sec": {
            "mean": 1060.9585,
            "std_dev": 122.00424464079107,
            "variance": 14885.035710369995,
            "min": 924.1762,
            "max": 1158.5556,
            "range": 234.37939999999992
        }
    }
}