{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 9,
            "Incorrect cases": 20,
            "Average distance for correct cases": 1.7777777777777777,
            "Average distance for incorrect cases": 9.95,
            "Overall average distance": 7.413793103448276,
            "Normalized average distance for correct cases": 0.0463768115942029,
            "Normalized average distance for incorrect cases": 0.25990870557120427,
            "Normalized overall average distance": 0.19364018675075556,
            "Correct step number predictions": 9,
            "Incorrect step number predictions": 20,
            "Step number accuracy": 0.3103448275862069,
            "Step accuracy within +-1": 0.3448275862068966,
            "Step accuracy within +-2": 0.41379310344827586,
            "Step accuracy within +-3": 0.4827586206896552,
            "Step accuracy within +-4": 0.5862068965517241,
            "Step accuracy within +-5": 0.6551724137931034,
            "total_prompt_tokens": 275427,
            "total_output_tokens": 51160,
            "total_tokens": 326587,
            "total_execution_time_sec": 589.0912
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 9,
            "Incorrect cases": 20,
            "Average distance for correct cases": 1.3333333333333333,
            "Average distance for incorrect cases": 6.65,
            "Overall average distance": 5.0,
            "Normalized average distance for correct cases": 0.03671497584541063,
            "Normalized average distance for incorrect cases": 0.1899778970525341,
            "Normalized overall average distance": 0.14241354219515096,
            "Correct step number predictions": 9,
            "Incorrect step number predictions": 20,
            "Step number accuracy": 0.3103448275862069,
            "Step accuracy within +-1": 0.3448275862068966,
            "Step accuracy within +-2": 0.4827586206896552,
            "Step accuracy within +-3": 0.5862068965517241,
            "Step accuracy within +-4": 0.6206896551724138,
            "Step accuracy within +-5": 0.6551724137931034,
            "total_prompt_tokens": 275427,
            "total_output_tokens": 46581,
            "total_tokens": 322008,
            "total_execution_time_sec": 537.5138
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 7,
            "Incorrect cases": 22,
            "Average distance for correct cases": 1.4285714285714286,
            "Average distance for incorrect cases": 6.954545454545454,
            "Overall average distance": 5.620689655172414,
            "Normalized average distance for correct cases": 0.03768115942028986,
            "Normalized average distance for incorrect cases": 0.20781083537235456,
            "Normalized overall average distance": 0.16674505152185617,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 19,
            "Step number accuracy": 0.3448275862068966,
            "Step accuracy within +-1": 0.3448275862068966,
            "Step accuracy within +-2": 0.4482758620689655,
            "Step accuracy within +-3": 0.5517241379310345,
            "Step accuracy within +-4": 0.6551724137931034,
            "Step accuracy within +-5": 0.7241379310344828,
            "total_prompt_tokens": 275427,
            "total_output_tokens": 57775,
            "total_tokens": 333202,
            "total_execution_time_sec": 646.9498
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 25,
        "overall_incorrect_cases": 62,
        "overall_total_cases": 87,
        "overall_accuracy": 0.28735632183908046,
        "overall_avg_distance_for_correct_cases": 1.52,
        "overall_avg_distance_for_incorrect_cases": 7.82258064516129,
        "overall_avg_distance": 6.011494252873563,
        "overall_normalized_avg_distance_for_correct_cases": 0.04046376811594203,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.21886403920429948,
        "overall_normalized_avg_distance": 0.16759959348925424,
        "overall_correct_step_number_predictions": 28,
        "overall_incorrect_step_number_predictions": 59,
        "overall_step_number_accuracy": 0.3218390804597701,
        "overall_step_accuracy_within_+-1": 0.3448275862068966,
        "overall_step_accuracy_within_+-2": 0.4482758620689655,
        "overall_step_accuracy_within_+-3": 0.5402298850574713,
        "overall_step_accuracy_within_+-4": 0.6206896551724138,
        "overall_step_accuracy_within_+-5": 0.6781609195402298,
        "grand_total_prompt_tokens": 826281,
        "grand_total_output_tokens": 155516,
        "grand_total_tokens": 981797,
        "grand_total_execution_time_sec": 1773.5548,
        "avg_prompt_tokens_per_run": 275427.0,
        "avg_output_tokens_per_run": 51838.666666666664,
        "avg_tokens_per_run": 327265.6666666667,
        "avg_execution_time_per_run_sec": 591.1849
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.28735632183908046,
            "std_dev": 0.03981725994411212,
            "variance": 0.0015854141894569955,
            "min": 0.2413793103448276,
            "max": 0.3103448275862069,
            "range": 0.06896551724137931,
            "coefficient_of_variation": 0.13856406460551018
        },
        "correct_cases": {
            "mean": 8.333333333333334,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 7,
            "max": 9,
            "range": 2
        },
        "incorrect_cases": {
            "mean": 20.666666666666668,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 20,
            "max": 22,
            "range": 2
        },
        "avg_distance_for_correct_cases": {
            "mean": 1.5132275132275133,
            "std_dev": 0.23400364431212675,
            "variance": 0.05475770555135633,
            "min": 1.3333333333333333,
            "max": 1.7777777777777777,
            "range": 0.4444444444444444
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 7.851515151515152,
            "std_dev": 1.8237094006341665,
            "variance": 3.325915977961431,
            "min": 6.65,
            "max": 9.95,
            "range": 3.299999999999999
        },
        "overall_avg_distance": {
            "mean": 6.011494252873563,
            "std_dev": 1.2534534142726974,
            "variance": 1.5711454617518825,
            "min": 5.0,
            "max": 7.413793103448276,
            "range": 2.413793103448276
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.040257648953301126,
            "std_dev": 0.005321324200276427,
            "variance": 2.8316491244447557e-05,
            "min": 0.03671497584541063,
            "max": 0.0463768115942029,
            "range": 0.009661835748792272
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.21923247933203097,
            "std_dev": 0.036337583272349545,
            "variance": 0.0013204199580749377,
            "min": 0.1899778970525341,
            "max": 0.25990870557120427,
            "range": 0.06993080851867017
        },
        "normalized_overall_avg_distance": {
            "mean": 0.16759959348925424,
            "std_dev": 0.02562401138750717,
            "variance": 0.0006565899595870971,
            "min": 0.14241354219515096,
            "max": 0.19364018675075556,
            "range": 0.051226644555604606
        },
        "correct_step_number_predictions": {
            "mean": 9.333333333333334,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 9,
            "max": 10,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 19.666666666666668,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 19,
            "max": 20,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.3218390804597701,
            "std_dev": 0.019908629972056068,
            "variance": 0.0003963535473642492,
            "min": 0.3103448275862069,
            "max": 0.3448275862068966,
            "range": 0.03448275862068967
        },
        "step_accuracy_within_+-1": {
            "mean": 0.3448275862068966,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.3448275862068966,
            "max": 0.3448275862068966,
            "range": 0.0
        },
        "step_accuracy_within_+-2": {
            "mean": 0.4482758620689655,
            "std_dev": 0.03448275862068967,
            "variance": 0.0011890606420927477,
            "min": 0.41379310344827586,
            "max": 0.4827586206896552,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-3": {
            "mean": 0.5402298850574713,
            "std_dev": 0.052673283850067094,
            "variance": 0.0027744748315497387,
            "min": 0.4827586206896552,
            "max": 0.5862068965517241,
            "range": 0.1034482758620689
        },
        "step_accuracy_within_+-4": {
            "mean": 0.6206896551724138,
            "std_dev": 0.03448275862068967,
            "variance": 0.0011890606420927477,
            "min": 0.5862068965517241,
            "max": 0.6551724137931034,
            "range": 0.06896551724137934
        },
        "step_accuracy_within_+-5": {
            "mean": 0.6781609195402298,
            "std_dev": 0.039817259944112136,
            "variance": 0.0015854141894569968,
            "min": 0.6551724137931034,
            "max": 0.7241379310344828,
            "range": 0.06896551724137934
        },
        "total_prompt_tokens": {
            "mean": 275427,
            "std_dev": 0.0,
            "variance": 0,
            "min": 275427,
            "max": 275427,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 51838.666666666664,
            "std_dev": 5627.774900734155,
            "variance": 31671850.333333332,
            "min": 46581,
            "max": 57775,
            "range": 11194
        },
        "total_tokens": {
            "mean": 327265.6666666667,
            "std_dev": 5627.774900734155,
            "variance": 31671850.333333332,
            "min": 322008,
            "max": 333202,
            "range": 11194
        },
        "total_execution_time_sec": {
            "mean": 591.1849333333333,
            "std_dev": 54.74803479078802,
            "variance": 2997.347313453335,
            "min": 537.5138,
            "max": 646.9498,
            "range": 109.43600000000004
        }
    }
}