{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 15,
            "Incorrect cases": 29,
            "Average distance for correct cases": 15.933333333333334,
            "Average distance for incorrect cases": 33.275862068965516,
            "Overall average distance": 27.363636363636363,
            "Normalized average distance for correct cases": 0.23762852114127436,
            "Normalized average distance for incorrect cases": 0.4847645829597567,
            "Normalized overall average distance": 0.400513652794365,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 34,
            "Step number accuracy": 0.22727272727272727,
            "Step accuracy within +-1": 0.2727272727272727,
            "Step accuracy within +-2": 0.2727272727272727,
            "Step accuracy within +-3": 0.3409090909090909,
            "Step accuracy within +-4": 0.38636363636363635,
            "Step accuracy within +-5": 0.4090909090909091,
            "total_prompt_tokens": 1138564,
            "total_output_tokens": 91850,
            "total_tokens": 1230414,
            "total_execution_time_sec": 1853.8794
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 14,
            "Incorrect cases": 30,
            "Average distance for correct cases": 21.142857142857142,
            "Average distance for incorrect cases": 30.133333333333333,
            "Overall average distance": 27.272727272727273,
            "Normalized average distance for correct cases": 0.2782948093388254,
            "Normalized average distance for incorrect cases": 0.4213569074868266,
            "Normalized overall average distance": 0.37583714898518983,
            "Correct step number predictions": 11,
            "Incorrect step number predictions": 33,
            "Step number accuracy": 0.25,
            "Step accuracy within +-1": 0.29545454545454547,
            "Step accuracy within +-2": 0.29545454545454547,
            "Step accuracy within +-3": 0.36363636363636365,
            "Step accuracy within +-4": 0.4090909090909091,
            "Step accuracy within +-5": 0.4318181818181818,
            "total_prompt_tokens": 1138564,
            "total_output_tokens": 92874,
            "total_tokens": 1231438,
            "total_execution_time_sec": 1949.5895
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 12,
            "Incorrect cases": 32,
            "Average distance for correct cases": 15.833333333333334,
            "Average distance for incorrect cases": 34.6875,
            "Overall average distance": 29.545454545454547,
            "Normalized average distance for correct cases": 0.2373696441228802,
            "Normalized average distance for incorrect cases": 0.4554321432663652,
            "Normalized overall average distance": 0.39596055259086926,
            "Correct step number predictions": 12,
            "Incorrect step number predictions": 32,
            "Step number accuracy": 0.2727272727272727,
            "Step accuracy within +-1": 0.3181818181818182,
            "Step accuracy within +-2": 0.3181818181818182,
            "Step accuracy within +-3": 0.38636363636363635,
            "Step accuracy within +-4": 0.4090909090909091,
            "Step accuracy within +-5": 0.4318181818181818,
            "total_prompt_tokens": 1138564,
            "total_output_tokens": 94139,
            "total_tokens": 1232703,
            "total_execution_time_sec": 2084.0974
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 41,
        "overall_incorrect_cases": 91,
        "overall_total_cases": 132,
        "overall_accuracy": 0.3106060606060606,
        "overall_avg_distance_for_correct_cases": 17.682926829268293,
        "overall_avg_distance_for_incorrect_cases": 32.73626373626374,
        "overall_avg_distance": 28.060606060606062,
        "overall_normalized_avg_distance_for_correct_cases": 0.251438801886274,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.4535462496149608,
        "overall_normalized_avg_distance": 0.39077045145680805,
        "overall_correct_step_number_predictions": 33,
        "overall_incorrect_step_number_predictions": 99,
        "overall_step_number_accuracy": 0.25,
        "overall_step_accuracy_within_+-1": 0.29545454545454547,
        "overall_step_accuracy_within_+-2": 0.29545454545454547,
        "overall_step_accuracy_within_+-3": 0.36363636363636365,
        "overall_step_accuracy_within_+-4": 0.40151515151515155,
        "overall_step_accuracy_within_+-5": 0.42424242424242425,
        "grand_total_prompt_tokens": 3415692,
        "grand_total_output_tokens": 278863,
        "grand_total_tokens": 3694555,
        "grand_total_execution_time_sec": 5887.5663,
        "avg_prompt_tokens_per_run": 1138564.0,
        "avg_output_tokens_per_run": 92954.33333333333,
        "avg_tokens_per_run": 1231518.3333333333,
        "avg_execution_time_per_run_sec": 1962.5221
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.3106060606060606,
            "std_dev": 0.034716482537544245,
            "variance": 0.0012052341597796144,
            "min": 0.2727272727272727,
            "max": 0.3409090909090909,
            "range": 0.06818181818181818,
            "coefficient_of_variation": 0.11177013890136196
        },
        "correct_cases": {
            "mean": 13.666666666666666,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 12,
            "max": 15,
            "range": 3
        },
        "incorrect_cases": {
            "mean": 30.333333333333332,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 29,
            "max": 32,
            "range": 3
        },
        "avg_distance_for_correct_cases": {
            "mean": 17.636507936507936,
            "std_dev": 3.0369991056385377,
            "variance": 9.22336356764928,
            "min": 15.833333333333334,
            "max": 21.142857142857142,
            "range": 5.309523809523808
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 32.69889846743295,
            "std_dev": 2.3312601217076825,
            "variance": 5.434773755064518,
            "min": 30.133333333333333,
            "max": 34.6875,
            "range": 4.554166666666667
        },
        "overall_avg_distance": {
            "mean": 28.060606060606062,
            "std_dev": 1.286719620943161,
            "variance": 1.655647382920112,
            "min": 27.272727272727273,
            "max": 29.545454545454547,
            "range": 2.2727272727272734
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.25109765820099333,
            "std_dev": 0.023553779459743258,
            "variance": 0.0005547805268382235,
            "min": 0.2373696441228802,
            "max": 0.2782948093388254,
            "range": 0.04092516521594519
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.4538512112376495,
            "std_dev": 0.0317333867839484,
            "variance": 0.0010070078367796708,
            "min": 0.4213569074868266,
            "max": 0.4847645829597567,
            "range": 0.06340767547293014
        },
        "normalized_overall_avg_distance": {
            "mean": 0.39077045145680805,
            "std_dev": 0.01313146307147064,
            "variance": 0.0001724353223973971,
            "min": 0.37583714898518983,
            "max": 0.400513652794365,
            "range": 0.024676503809175188
        },
        "correct_step_number_predictions": {
            "mean": 11,
            "std_dev": 1.0,
            "variance": 1,
            "min": 10,
            "max": 12,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 33,
            "std_dev": 1.0,
            "variance": 1,
            "min": 32,
            "max": 34,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.25,
            "std_dev": 0.02272727272727272,
            "variance": 0.0005165289256198344,
            "min": 0.22727272727272727,
            "max": 0.2727272727272727,
            "range": 0.04545454545454544
        },
        "step_accuracy_within_+-1": {
            "mean": 0.29545454545454547,
            "std_dev": 0.022727272727272735,
            "variance": 0.000516528925619835,
            "min": 0.2727272727272727,
            "max": 0.3181818181818182,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-2": {
            "mean": 0.29545454545454547,
            "std_dev": 0.022727272727272735,
            "variance": 0.000516528925619835,
            "min": 0.2727272727272727,
            "max": 0.3181818181818182,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-3": {
            "mean": 0.36363636363636365,
            "std_dev": 0.022727272727272735,
            "variance": 0.000516528925619835,
            "min": 0.3409090909090909,
            "max": 0.38636363636363635,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-4": {
            "mean": 0.40151515151515155,
            "std_dev": 0.01312159702703697,
            "variance": 0.00017217630853994545,
            "min": 0.38636363636363635,
            "max": 0.4090909090909091,
            "range": 0.022727272727272763
        },
        "step_accuracy_within_+-5": {
            "mean": 0.42424242424242425,
            "std_dev": 0.013121597027036937,
            "variance": 0.0001721763085399446,
            "min": 0.4090909090909091,
            "max": 0.4318181818181818,
            "range": 0.022727272727272707
        },
        "total_prompt_tokens": {
            "mean": 1138564,
            "std_dev": 0.0,
            "variance": 0,
            "min": 1138564,
            "max": 1138564,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 92954.33333333333,
            "std_dev": 1146.6125471724672,
            "variance": 1314720.3333333333,
            "min": 91850,
            "max": 94139,
            "range": 2289
        },
        "total_tokens": {
            "mean": 1231518.3333333333,
            "std_dev": 1146.6125471724672,
            "variance": 1314720.3333333333,
            "min": 1230414,
            "max": 1232703,
            "range": 2289
        },
        "total_execution_time_sec": {
            "mean": 1962.5221000000001,
            "std_dev": 115.65258746811507,
            "variance": 13375.52098807001,
            "min": 1853.8794,
            "max": 2084.0974,
            "range": 230.21800000000007
        }
    }
}