{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 21,
            "Incorrect cases": 21,
            "Average distance for correct cases": 0.42857142857142855,
            "Average distance for incorrect cases": 0.38095238095238093,
            "Overall average distance": 0.40476190476190477,
            "Normalized average distance for correct cases": 0.012486807724902963,
            "Normalized average distance for incorrect cases": 0.015597442680776013,
            "Normalized overall average distance": 0.014042125202839487,
            "Correct step number predictions": 29,
            "Incorrect step number predictions": 13,
            "Step number accuracy": 0.6904761904761905,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 592602,
            "total_output_tokens": 64764,
            "total_tokens": 657366,
            "total_execution_time_sec": 589.8992
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 22,
            "Incorrect cases": 20,
            "Average distance for correct cases": 0.5,
            "Average distance for incorrect cases": 0.3,
            "Overall average distance": 0.40476190476190477,
            "Normalized average distance for correct cases": 0.014444478080841715,
            "Normalized average distance for incorrect cases": 0.01388888888888889,
            "Normalized overall average distance": 0.014179911798959418,
            "Correct step number predictions": 29,
            "Incorrect step number predictions": 13,
            "Step number accuracy": 0.6904761904761905,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 592602,
            "total_output_tokens": 67115,
            "total_tokens": 659717,
            "total_execution_time_sec": 640.7219
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 23,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.4782608695652174,
            "Average distance for incorrect cases": 0.05263157894736842,
            "Overall average distance": 0.2857142857142857,
            "Normalized average distance for correct cases": 0.013816457294718162,
            "Normalized average distance for incorrect cases": 0.0029239766081871343,
            "Normalized overall average distance": 0.008888906507954127,
            "Correct step number predictions": 31,
            "Incorrect step number predictions": 11,
            "Step number accuracy": 0.7380952380952381,
            "Step accuracy within +-1": 0.9761904761904762,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 592602,
            "total_output_tokens": 69772,
            "total_tokens": 662374,
            "total_execution_time_sec": 627.172
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 66,
        "overall_incorrect_cases": 60,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5238095238095238,
        "overall_avg_distance_for_correct_cases": 0.4696969696969697,
        "overall_avg_distance_for_incorrect_cases": 0.25,
        "overall_avg_distance": 0.36507936507936506,
        "overall_normalized_avg_distance_for_correct_cases": 0.013602727239090872,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.01101466049382716,
        "overall_normalized_avg_distance": 0.01237031450325101,
        "overall_correct_step_number_predictions": 89,
        "overall_incorrect_step_number_predictions": 37,
        "overall_step_number_accuracy": 0.7063492063492064,
        "overall_step_accuracy_within_+-1": 0.9444444444444444,
        "overall_step_accuracy_within_+-2": 0.9841269841269841,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1777806,
        "grand_total_output_tokens": 201651,
        "grand_total_tokens": 1979457,
        "grand_total_execution_time_sec": 1857.7931,
        "avg_prompt_tokens_per_run": 592602.0,
        "avg_output_tokens_per_run": 67217.0,
        "avg_tokens_per_run": 659819.0,
        "avg_execution_time_per_run_sec": 619.2644
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5238095238095238,
            "std_dev": 0.023809523809523836,
            "variance": 0.0005668934240362824,
            "min": 0.5,
            "max": 0.5476190476190477,
            "range": 0.04761904761904767,
            "coefficient_of_variation": 0.045454545454545504
        },
        "correct_cases": {
            "mean": 22,
            "std_dev": 1.0,
            "variance": 1,
            "min": 21,
            "max": 23,
            "range": 2
        },
        "incorrect_cases": {
            "mean": 20,
            "std_dev": 1.0,
            "variance": 1,
            "min": 19,
            "max": 21,
            "range": 2
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.468944099378882,
            "std_dev": 0.03661436684022236,
            "variance": 0.0013406118591103749,
            "min": 0.42857142857142855,
            "max": 0.5,
            "range": 0.07142857142857145
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.2445279866332498,
            "std_dev": 0.17104530235444743,
            "variance": 0.029256495457524342,
            "min": 0.05263157894736842,
            "max": 0.38095238095238093,
            "range": 0.3283208020050125
        },
        "overall_avg_distance": {
            "mean": 0.36507936507936506,
            "std_dev": 0.06873217490352689,
            "variance": 0.004724111866969012,
            "min": 0.2857142857142857,
            "max": 0.40476190476190477,
            "range": 0.11904761904761907
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.013582581033487614,
            "std_dev": 0.000999570850318138,
            "variance": 9.99141884805725e-07,
            "min": 0.012486807724902963,
            "max": 0.014444478080841715,
            "range": 0.0019576703559387523
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.010803436059284013,
            "std_dev": 0.006877077863893422,
            "variance": 4.7294199946052904e-05,
            "min": 0.0029239766081871343,
            "max": 0.015597442680776013,
            "range": 0.012673466072588879
        },
        "normalized_overall_avg_distance": {
            "mean": 0.01237031450325101,
            "std_dev": 0.003015774777533193,
            "variance": 9.09489750880538e-06,
            "min": 0.008888906507954127,
            "max": 0.014179911798959418,
            "range": 0.005291005291005291
        },
        "correct_step_number_predictions": {
            "mean": 29.666666666666668,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 29,
            "max": 31,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 12.333333333333334,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 11,
            "max": 13,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.7063492063492064,
            "std_dev": 0.02749286996141078,
            "variance": 0.0007558578987150432,
            "min": 0.6904761904761905,
            "max": 0.7380952380952381,
            "range": 0.04761904761904767
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9444444444444444,
            "std_dev": 0.027492869961410718,
            "variance": 0.0007558578987150397,
            "min": 0.9285714285714286,
            "max": 0.9761904761904762,
            "range": 0.04761904761904756
        },
        "step_accuracy_within_+-2": {
            "mean": 0.9841269841269841,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.9761904761904762,
            "max": 1.0,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 592602,
            "std_dev": 0.0,
            "variance": 0,
            "min": 592602,
            "max": 592602,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 67217,
            "std_dev": 2505.557622566282,
            "variance": 6277819,
            "min": 64764,
            "max": 69772,
            "range": 5008
        },
        "total_tokens": {
            "mean": 659819,
            "std_dev": 2505.557622566282,
            "variance": 6277819,
            "min": 657366,
            "max": 662374,
            "range": 5008
        },
        "total_execution_time_sec": {
            "mean": 619.2643666666667,
            "std_dev": 26.3179540907597,
            "variance": 692.6347075233351,
            "min": 589.8992,
            "max": 640.7219,
            "range": 50.822700000000054
        }
    }
}