{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 23,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.21739130434782608,
            "Average distance for incorrect cases": 0.47368421052631576,
            "Overall average distance": 0.3333333333333333,
            "Normalized average distance for correct cases": 0.007101272318663622,
            "Normalized average distance for incorrect cases": 0.015423129345030873,
            "Normalized overall average distance": 0.01086592192582976,
            "Correct step number predictions": 30,
            "Incorrect step number predictions": 12,
            "Step number accuracy": 0.7142857142857143,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 343515,
            "total_output_tokens": 79489,
            "total_tokens": 423004,
            "total_execution_time_sec": 837.2567
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 22,
            "Incorrect cases": 20,
            "Average distance for correct cases": 0.22727272727272727,
            "Average distance for incorrect cases": 0.5,
            "Overall average distance": 0.35714285714285715,
            "Normalized average distance for correct cases": 0.00788100788100788,
            "Normalized average distance for incorrect cases": 0.01833092833092833,
            "Normalized overall average distance": 0.012857160476208097,
            "Correct step number predictions": 31,
            "Incorrect step number predictions": 11,
            "Step number accuracy": 0.7380952380952381,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 343515,
            "total_output_tokens": 70004,
            "total_tokens": 413519,
            "total_execution_time_sec": 733.6042
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 25,
            "Incorrect cases": 17,
            "Average distance for correct cases": 0.28,
            "Average distance for incorrect cases": 0.4117647058823529,
            "Overall average distance": 0.3333333333333333,
            "Normalized average distance for correct cases": 0.008753468753468755,
            "Normalized average distance for incorrect cases": 0.017802676626206035,
            "Normalized overall average distance": 0.01241624336862432,
            "Correct step number predictions": 31,
            "Incorrect step number predictions": 11,
            "Step number accuracy": 0.7380952380952381,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 343515,
            "total_output_tokens": 74406,
            "total_tokens": 417921,
            "total_execution_time_sec": 741.4046
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 70,
        "overall_incorrect_cases": 56,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5555555555555556,
        "overall_avg_distance_for_correct_cases": 0.24285714285714285,
        "overall_avg_distance_for_incorrect_cases": 0.4642857142857143,
        "overall_avg_distance": 0.3412698412698413,
        "overall_normalized_avg_distance_for_correct_cases": 0.007936402222116508,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.017183991550350997,
        "overall_normalized_avg_distance": 0.01204644192355406,
        "overall_correct_step_number_predictions": 92,
        "overall_incorrect_step_number_predictions": 34,
        "overall_step_number_accuracy": 0.7301587301587301,
        "overall_step_accuracy_within_+-1": 0.9444444444444444,
        "overall_step_accuracy_within_+-2": 0.9841269841269841,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1030545,
        "grand_total_output_tokens": 223899,
        "grand_total_tokens": 1254444,
        "grand_total_execution_time_sec": 2312.2655,
        "avg_prompt_tokens_per_run": 343515.0,
        "avg_output_tokens_per_run": 74633.0,
        "avg_tokens_per_run": 418148.0,
        "avg_execution_time_per_run_sec": 770.7552
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5555555555555556,
            "std_dev": 0.036369648372665375,
            "variance": 0.0013227513227513214,
            "min": 0.5238095238095238,
            "max": 0.5952380952380952,
            "range": 0.0714285714285714,
            "coefficient_of_variation": 0.06546536707079767
        },
        "correct_cases": {
            "mean": 23.333333333333332,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 22,
            "max": 25,
            "range": 3
        },
        "incorrect_cases": {
            "mean": 18.666666666666668,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 17,
            "max": 20,
            "range": 3
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.24155467720685111,
            "std_dev": 0.033659215131331445,
            "variance": 0.0011329427632572515,
            "min": 0.21739130434782608,
            "max": 0.28,
            "range": 0.06260869565217395
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.4618163054695562,
            "std_dev": 0.045299030451369654,
            "variance": 0.0020520021598341154,
            "min": 0.4117647058823529,
            "max": 0.5,
            "range": 0.08823529411764708
        },
        "overall_avg_distance": {
            "mean": 0.3412698412698413,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.3333333333333333,
            "max": 0.35714285714285715,
            "range": 0.023809523809523836
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.00791191631771342,
            "std_dev": 0.0008265317679257,
            "variance": 6.831547633903831e-07,
            "min": 0.007101272318663622,
            "max": 0.008753468753468755,
            "range": 0.0016521964348051327
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.017185578100721747,
            "std_dev": 0.0015490099025547684,
            "variance": 2.399431678212733e-06,
            "min": 0.015423129345030873,
            "max": 0.01833092833092833,
            "range": 0.002907798985897457
        },
        "normalized_overall_avg_distance": {
            "mean": 0.01204644192355406,
            "std_dev": 0.0010458597289838844,
            "variance": 1.0938225727102445e-06,
            "min": 0.01086592192582976,
            "max": 0.012857160476208097,
            "range": 0.001991238550378336
        },
        "correct_step_number_predictions": {
            "mean": 30.666666666666668,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 30,
            "max": 31,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 11.333333333333334,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 11,
            "max": 12,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.7301587301587302,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.7142857142857143,
            "max": 0.7380952380952381,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9444444444444444,
            "std_dev": 0.013746434980705326,
            "variance": 0.00018896447467875904,
            "min": 0.9285714285714286,
            "max": 0.9523809523809523,
            "range": 0.023809523809523725
        },
        "step_accuracy_within_+-2": {
            "mean": 0.9841269841269841,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.9761904761904762,
            "max": 1.0,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 343515,
            "std_dev": 0.0,
            "variance": 0,
            "min": 343515,
            "max": 343515,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 74633,
            "std_dev": 4746.572763584269,
            "variance": 22529953,
            "min": 70004,
            "max": 79489,
            "range": 9485
        },
        "total_tokens": {
            "mean": 418148,
            "std_dev": 4746.572763584269,
            "variance": 22529953,
            "min": 413519,
            "max": 423004,
            "range": 9485
        },
        "total_execution_time_sec": {
            "mean": 770.7551666666667,
            "std_dev": 57.72392928243309,
            "variance": 3332.0520118033364,
            "min": 733.6042,
            "max": 837.2567,
            "range": 103.65250000000003
        }
    }
}