{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 11,
            "Incorrect cases": 18,
            "Average distance for correct cases": 0.8888888888888888,
            "Average distance for incorrect cases": 4.25,
            "Overall average distance": 3.206896551724138,
            "Normalized average distance for correct cases": 0.02242926155969634,
            "Normalized average distance for incorrect cases": 0.11410221321298775,
            "Normalized overall average distance": 0.08565198683782835,
            "Correct step number predictions": 11,
            "Incorrect step number predictions": 18,
            "Step number accuracy": 0.3793103448275862,
            "Step accuracy within +-1": 0.41379310344827586,
            "Step accuracy within +-2": 0.5517241379310345,
            "Step accuracy within +-3": 0.6206896551724138,
            "Step accuracy within +-4": 0.7931034482758621,
            "Step accuracy within +-5": 0.8620689655172413,
            "total_prompt_tokens": 361040,
            "total_output_tokens": 68546,
            "total_tokens": 429586,
            "total_execution_time_sec": 2250.0683
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 1.3333333333333333,
            "Average distance for incorrect cases": 3.1,
            "Overall average distance": 2.5517241379310347,
            "Normalized average distance for correct cases": 0.03671497584541063,
            "Normalized average distance for incorrect cases": 0.07989939848385721,
            "Normalized overall average distance": 0.06649733628571862,
            "Correct step number predictions": 13,
            "Incorrect step number predictions": 16,
            "Step number accuracy": 0.4482758620689655,
            "Step accuracy within +-1": 0.5172413793103449,
            "Step accuracy within +-2": 0.5517241379310345,
            "Step accuracy within +-3": 0.6206896551724138,
            "Step accuracy within +-4": 0.7586206896551724,
            "Step accuracy within +-5": 0.8275862068965517,
            "total_prompt_tokens": 361040,
            "total_output_tokens": 73145,
            "total_tokens": 434185,
            "total_execution_time_sec": 2510.6979
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 19,
            "Average distance for correct cases": 1.3333333333333333,
            "Average distance for incorrect cases": 5.2,
            "Overall average distance": 4.0,
            "Normalized average distance for correct cases": 0.03671497584541063,
            "Normalized average distance for incorrect cases": 0.15095203006280458,
            "Normalized overall average distance": 0.11549915116775128,
            "Correct step number predictions": 12,
            "Incorrect step number predictions": 17,
            "Step number accuracy": 0.41379310344827586,
            "Step accuracy within +-1": 0.4482758620689655,
            "Step accuracy within +-2": 0.4827586206896552,
            "Step accuracy within +-3": 0.5517241379310345,
            "Step accuracy within +-4": 0.6896551724137931,
            "Step accuracy within +-5": 0.7586206896551724,
            "total_prompt_tokens": 361040,
            "total_output_tokens": 71206,
            "total_tokens": 432246,
            "total_execution_time_sec": 2230.1328
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 31,
        "overall_incorrect_cases": 56,
        "overall_total_cases": 87,
        "overall_accuracy": 0.3563218390804598,
        "overall_avg_distance_for_correct_cases": 1.1851851851851851,
        "overall_avg_distance_for_incorrect_cases": 4.183333333333334,
        "overall_avg_distance": 3.2528735632183907,
        "overall_normalized_avg_distance_for_correct_cases": 0.03195307108350586,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.11498454725321651,
        "overall_normalized_avg_distance": 0.08921615809709942,
        "overall_correct_step_number_predictions": 36,
        "overall_incorrect_step_number_predictions": 51,
        "overall_step_number_accuracy": 0.41379310344827586,
        "overall_step_accuracy_within_+-1": 0.45977011494252873,
        "overall_step_accuracy_within_+-2": 0.5287356321839081,
        "overall_step_accuracy_within_+-3": 0.5977011494252874,
        "overall_step_accuracy_within_+-4": 0.7471264367816092,
        "overall_step_accuracy_within_+-5": 0.8160919540229885,
        "grand_total_prompt_tokens": 1083120,
        "grand_total_output_tokens": 212897,
        "grand_total_tokens": 1296017,
        "grand_total_execution_time_sec": 6990.899,
        "avg_prompt_tokens_per_run": 361040.0,
        "avg_output_tokens_per_run": 70965.66666666667,
        "avg_tokens_per_run": 432005.6666666667,
        "avg_execution_time_per_run_sec": 2330.2997
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.3563218390804598,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.344827586275862,
            "max": 0.379310344827586,
            "range": 0.03448275862068967,
            "coefficient_of_variation": 0.0
        },
        "correct_cases": {
            "mean": 10,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 10,
            "max": 11,
            "range": 1
        },
        "incorrect_cases": {
            "mean": 19,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 18,
            "max": 19,
            "range": 1
        },
        "avg_distance_for_correct_cases": {
            "mean": 1.1851851851851851,
            "std_dev": 0.25660011963983365,
            "variance": 0.06584362139917695,
            "min": 0.8888888888888888,
            "max": 1.3333333333333333,
            "range": 0.4444444444444444
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 4.183333333333334,
            "std_dev": 1.051586103623157,
            "variance": 1.1058333333333334,
            "min": 3.1,
            "max": 5.2,
            "range": 2.1
        },
        "overall_avg_distance": {
            "mean": 3.2528735632183907,
            "std_dev": 0.7252317956021772,
            "variance": 0.5259611573523582,
            "min": 2.5517241379310347,
            "max": 4.0,
            "range": 1.4482758620689653
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.03195307108350587,
            "std_dev": 0.008247860988423226,
            "variance": 6.802721088435375e-05,
            "min": 0.02242926155969634,
            "max": 0.03671497584541063,
            "range": 0.014285714285714287
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.11498454725321651,
            "std_dev": 0.03553453248028342,
            "variance": 0.001262702998592317,
            "min": 0.07989939848385721,
            "max": 0.15095203006280458,
            "range": 0.07105263157894737
        },
        "normalized_overall_avg_distance": {
            "mean": 0.08921615809709942,
            "std_dev": 0.024694573351392603,
            "variance": 0.0006098219530073098,
            "min": 0.06649733628571862,
            "max": 0.11549915116775128,
            "range": 0.04900181488203266
        },
        "correct_step_number_predictions": {
            "mean": 12,
            "std_dev": 1.0,
            "variance": 1,
            "min": 11,
            "max": 13,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 17,
            "std_dev": 1.0,
            "variance": 1,
            "min": 16,
            "max": 18,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.41379310344827586,
            "std_dev": 0.03448275862068967,
            "variance": 0.0011890606420927477,
            "min": 0.3793103448275862,
            "max": 0.4482758620689655,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-1": {
            "mean": 0.45977011494252873,
            "std_dev": 0.05267328385006715,
            "variance": 0.0027744748315497447,
            "min": 0.41379310344827586,
            "max": 0.5172413793103449,
            "range": 0.103448275862069
        },
        "step_accuracy_within_+-2": {
            "mean": 0.5287356321839081,
            "std_dev": 0.0398172599441121,
            "variance": 0.0015854141894569944,
            "min": 0.4827586206896552,
            "max": 0.5517241379310345,
            "range": 0.06896551724137928
        },
        "step_accuracy_within_+-3": {
            "mean": 0.5977011494252874,
            "std_dev": 0.039817259944112136,
            "variance": 0.0015854141894569968,
            "min": 0.5517241379310345,
            "max": 0.6206896551724138,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-4": {
            "mean": 0.7471264367816092,
            "std_dev": 0.05267328385006711,
            "variance": 0.002774474831549741,
            "min": 0.6896551724137931,
            "max": 0.7931034482758621,
            "range": 0.10344827586206895
        },
        "step_accuracy_within_+-5": {
            "mean": 0.8160919540229885,
            "std_dev": 0.05267328385006712,
            "variance": 0.002774474831549742,
            "min": 0.7586206896551724,
            "max": 0.8620689655172413,
            "range": 0.10344827586206895
        },
        "total_prompt_tokens": {
            "mean": 361040,
            "std_dev": 0.0,
            "variance": 0,
            "min": 361040,
            "max": 361040,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 70965.66666666667,
            "std_dev": 2308.9002432615694,
            "variance": 5331020.333333333,
            "min": 68546,
            "max": 73145,
            "range": 4599
        },
        "total_tokens": {
            "mean": 432005.6666666667,
            "std_dev": 2308.9002432615694,
            "variance": 5331020.333333333,
            "min": 429586,
            "max": 434185,
            "range": 4599
        },
        "total_execution_time_sec": {
            "mean": 2330.299666666667,
            "std_dev": 156.54711106374137,
            "variance": 24506.997982403376,
            "min": 2230.1328,
            "max": 2510.6979,
            "range": 280.56510000000026
        }
    }
}