{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 3.0,
            "Average distance for incorrect cases": 4.0,
            "Overall average distance": 3.689655172413793,
            "Normalized average distance for correct cases": 0.07474407637451116,
            "Normalized average distance for incorrect cases": 0.11117048485362782,
            "Normalized overall average distance": 0.09986573739459162,
            "Correct step number predictions": 11,
            "Incorrect step number predictions": 18,
            "Step number accuracy": 0.3793103448275862,
            "Step accuracy within +-1": 0.3793103448275862,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.5172413793103449,
            "Step accuracy within +-4": 0.7241379310344828,
            "Step accuracy within +-5": 0.7931034482758621,
            "total_prompt_tokens": 381050,
            "total_output_tokens": 69983,
            "total_tokens": 451033,
            "total_execution_time_sec": 2386.8897
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 1.5,
            "Average distance for incorrect cases": 4.333333333333333,
            "Overall average distance": 3.5517241379310347,
            "Normalized average distance for correct cases": 0.041304347826086954,
            "Normalized average distance for incorrect cases": 0.11877060478743437,
            "Normalized overall average distance": 0.09740060286706267,
            "Correct step number predictions": 12,
            "Incorrect step number predictions": 17,
            "Step number accuracy": 0.41379310344827586,
            "Step accuracy within +-1": 0.41379310344827586,
            "Step accuracy within +-2": 0.5172413793103449,
            "Step accuracy within +-3": 0.5862068965517241,
            "Step accuracy within +-4": 0.6896551724137931,
            "Step accuracy within +-5": 0.7586206896551724,
            "total_prompt_tokens": 381050,
            "total_output_tokens": 75775,
            "total_tokens": 456825,
            "total_execution_time_sec": 2635.6971
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 11,
            "Incorrect cases": 18,
            "Average distance for correct cases": 0.8571428571428571,
            "Average distance for incorrect cases": 4.318181818181818,
            "Overall average distance": 3.4827586206896552,
            "Normalized average distance for correct cases": 0.018633540372670808,
            "Normalized average distance for incorrect cases": 0.12123261474240977,
            "Normalized overall average distance": 0.0964673209290245,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 19,
            "Step number accuracy": 0.3448275862068966,
            "Step accuracy within +-1": 0.41379310344827586,
            "Step accuracy within +-2": 0.4827586206896552,
            "Step accuracy within +-3": 0.5517241379310345,
            "Step accuracy within +-4": 0.7586206896551724,
            "Step accuracy within +-5": 0.8275862068965517,
            "total_prompt_tokens": 381050,
            "total_output_tokens": 69533,
            "total_tokens": 450583,
            "total_execution_time_sec": 2158.0741
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 31,
        "overall_incorrect_cases": 56,
        "overall_total_cases": 87,
        "overall_accuracy": 0.3563218390804598,
        "overall_avg_distance_for_correct_cases": 1.875,
        "overall_avg_distance_for_incorrect_cases": 4.222222222222222,
        "overall_avg_distance": 3.574712643678161,
        "overall_normalized_avg_distance_for_correct_cases": 0.04723192719116632,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.11721761780859831,
        "overall_normalized_avg_distance": 0.09791122039689293,
        "overall_correct_step_number_predictions": 33,
        "overall_incorrect_step_number_predictions": 54,
        "overall_step_number_accuracy": 0.3793103448275862,
        "overall_step_accuracy_within_+-1": 0.40229885057471265,
        "overall_step_accuracy_within_+-2": 0.4827586206896552,
        "overall_step_accuracy_within_+-3": 0.5517241379310345,
        "overall_step_accuracy_within_+-4": 0.7241379310344828,
        "overall_step_accuracy_within_+-5": 0.7931034482758621,
        "grand_total_prompt_tokens": 1143150,
        "grand_total_output_tokens": 215291,
        "grand_total_tokens": 1358441,
        "grand_total_execution_time_sec": 7180.6609,
        "avg_prompt_tokens_per_run": 381050.0,
        "avg_output_tokens_per_run": 71763.66666666667,
        "avg_tokens_per_run": 452813.6666666667,
        "avg_execution_time_per_run_sec": 2393.5536
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.3563218390804598,
            "std_dev": 0.034482758620689655,
            "variance": 0.0011890606420927466,
            "min": 0.3448275862068966,
            "max": 0.3793103448275862,
            "range": 0.03448275862068967,
            "coefficient_of_variation": 0.0967741935483871
        },
        "correct_cases": {
            "mean": 10,
            "std_dev": 1.0,
            "variance": 1,
            "min": 10,
            "max": 11,
            "range": 1
        },
        "incorrect_cases": {
            "mean": 19,
            "std_dev": 1.0,
            "variance": 1,
            "min": 18,
            "max": 19,
            "range": 1
        },
        "avg_distance_for_correct_cases": {
            "mean": 1.7857142857142856,
            "std_dev": 1.0996288798814753,
            "variance": 1.2091836734693877,
            "min": 0.8571428571428571,
            "max": 3.0,
            "range": 2.142857142857143
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 4.217171717171717,
            "std_dev": 0.18822873892509312,
            "variance": 0.035430058157330864,
            "min": 4.0,
            "max": 4.333333333333333,
            "range": 0.33333333333333304
        },
        "overall_avg_distance": {
            "mean": 3.574712643678161,
            "std_dev": 0.10534656770013422,
            "variance": 0.011097899326198963,
            "min": 3.4827586206896552,
            "max": 3.689655172413793,
            "range": 0.2068965517241379
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.04489398819108964,
            "std_dev": 0.028226976477582544,
            "variance": 0.0007967622010659982,
            "min": 0.018633540372670808,
            "max": 0.07474407637451116,
            "range": 0.05611053600184036
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.11705790146115733,
            "std_dev": 0.0052451529041163455,
            "variance": 2.751162898756013e-05,
            "min": 0.11117048485362782,
            "max": 0.12123261474240977,
            "range": 0.010062129888781954
        },
        "normalized_overall_avg_distance": {
            "mean": 0.09791122039689293,
            "std_dev": 0.0017558064570694816,
            "variance": 3.082856314686885e-06,
            "min": 0.0964673209290245,
            "max": 0.09986573739459162,
            "range": 0.003398416465567114
        },
        "correct_step_number_predictions": {
            "mean": 9.333333333333334,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 8,
            "max": 10,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 19.666666666666668,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 19,
            "max": 21,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.3793103448275862,
            "std_dev": 0.039817259944112136,
            "variance": 0.0015854141894569968,
            "min": 0.27586206896551724,
            "max": 0.3448275862068966,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-1": {
            "mean": 0.40229885057471265,
            "std_dev": 0.019908629972056068,
            "variance": 0.0003963535473642492,
            "min": 0.3793103448275862,
            "max": 0.41379310344827586,
            "range": 0.03448275862068967
        },
        "step_accuracy_within_+-2": {
            "mean": 0.4827586206896552,
            "std_dev": 0.03448275862068967,
            "variance": 0.0011890606420927477,
            "min": 0.4482758620689655,
            "max": 0.5172413793103449,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-3": {
            "mean": 0.5517241379310345,
            "std_dev": 0.03448275862068961,
            "variance": 0.0011890606420927438,
            "min": 0.5172413793103449,
            "max": 0.5862068965517241,
            "range": 0.06896551724137923
        },
        "step_accuracy_within_+-4": {
            "mean": 0.7241379310344828,
            "std_dev": 0.03448275862068961,
            "variance": 0.0011890606420927438,
            "min": 0.6896551724137931,
            "max": 0.7586206896551724,
            "range": 0.06896551724137923
        },
        "step_accuracy_within_+-5": {
            "mean": 0.7931034482758621,
            "std_dev": 0.03448275862068967,
            "variance": 0.0011890606420927477,
            "min": 0.7586206896551724,
            "max": 0.8275862068965517,
            "range": 0.06896551724137934
        },
        "total_prompt_tokens": {
            "mean": 381050,
            "std_dev": 0.0,
            "variance": 0,
            "min": 381050,
            "max": 381050,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 71763.66666666667,
            "std_dev": 3481.195388560276,
            "variance": 12118721.333333334,
            "min": 69533,
            "max": 75775,
            "range": 6242
        },
        "total_tokens": {
            "mean": 452813.6666666667,
            "std_dev": 3481.195388560276,
            "variance": 12118721.333333334,
            "min": 450583,
            "max": 456825,
            "range": 6242
        },
        "total_execution_time_sec": {
            "mean": 2393.5536333333334,
            "std_dev": 238.881222656477,
            "variance": 57064.23853785334,
            "min": 2158.0741,
            "max": 2635.6971,
            "range": 477.62300000000005
        }
    }
}