{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 17,
            "Incorrect cases": 27,
            "Average distance for correct cases": 3.0588235294117645,
            "Average distance for incorrect cases": 17.37037037037037,
            "Overall average distance": 11.840909090909092,
            "Normalized average distance for correct cases": 0.10544835072139327,
            "Normalized average distance for incorrect cases": 0.266162660663885,
            "Normalized overall average distance": 0.20406849545883138,
            "Correct step number predictions": 13,
            "Incorrect step number predictions": 31,
            "Step number accuracy": 0.29545454545454547,
            "Step accuracy within +-1": 0.4318181818181818,
            "Step accuracy within +-2": 0.45454545454545453,
            "Step accuracy within +-3": 0.5,
            "Step accuracy within +-4": 0.5454545454545454,
            "Step accuracy within +-5": 0.5454545454545454,
            "total_prompt_tokens": 1672714,
            "total_output_tokens": 63069,
            "total_tokens": 1735783,
            "total_execution_time_sec": 1413.9332
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 13,
            "Incorrect cases": 31,
            "Average distance for correct cases": 2.769230769230769,
            "Average distance for incorrect cases": 17.0,
            "Overall average distance": 12.795454545454545,
            "Normalized average distance for correct cases": 0.1026717447486843,
            "Normalized average distance for incorrect cases": 0.2750055207537316,
            "Normalized overall average distance": 0.2240887232976949,
            "Correct step number predictions": 12,
            "Incorrect step number predictions": 32,
            "Step number accuracy": 0.2727272727272727,
            "Step accuracy within +-1": 0.38636363636363635,
            "Step accuracy within +-2": 0.4090909090909091,
            "Step accuracy within +-3": 0.45454545454545453,
            "Step accuracy within +-4": 0.5227272727272727,
            "Step accuracy within +-5": 0.5227272727272727,
            "total_prompt_tokens": 1672714,
            "total_output_tokens": 56942,
            "total_tokens": 1729656,
            "total_execution_time_sec": 1196.3355
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 15,
            "Incorrect cases": 29,
            "Average distance for correct cases": 11.666666666666666,
            "Average distance for incorrect cases": 18.724137931034484,
            "Overall average distance": 16.318181818181817,
            "Normalized average distance for correct cases": 0.22838956725934897,
            "Normalized average distance for incorrect cases": 0.28385474699078567,
            "Normalized overall average distance": 0.26494616299143225,
            "Correct step number predictions": 12,
            "Incorrect step number predictions": 32,
            "Step number accuracy": 0.2727272727272727,
            "Step accuracy within +-1": 0.38636363636363635,
            "Step accuracy within +-2": 0.4090909090909091,
            "Step accuracy within +-3": 0.45454545454545453,
            "Step accuracy within +-4": 0.5227272727272727,
            "Step accuracy within +-5": 0.5227272727272727,
            "total_prompt_tokens": 1672714,
            "total_output_tokens": 57984,
            "total_tokens": 1730698,
            "total_execution_time_sec": 1755.8725
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 45,
        "overall_incorrect_cases": 87,
        "overall_total_cases": 132,
        "overall_accuracy": 0.3409090909090909,
        "overall_avg_distance_for_correct_cases": 5.844444444444444,
        "overall_avg_distance_for_incorrect_cases": 17.689655172413794,
        "overall_avg_distance": 13.651515151515152,
        "overall_normalized_avg_distance_for_correct_cases": 0.14562662561970702,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.2752109269427972,
        "overall_normalized_avg_distance": 0.23103446058265278,
        "overall_correct_step_number_predictions": 37,
        "overall_incorrect_step_number_predictions": 95,
        "overall_step_number_accuracy": 0.2803030303030303,
        "overall_step_accuracy_within_+-1": 0.4015151515151515,
        "overall_step_accuracy_within_+-2": 0.42424242424242425,
        "overall_step_accuracy_within_+-3": 0.46969696969696967,
        "overall_step_accuracy_within_+-4": 0.5303030303030303,
        "overall_step_accuracy_within_+-5": 0.5303030303030303,
        "grand_total_prompt_tokens": 5018142,
        "grand_total_output_tokens": 177995,
        "grand_total_tokens": 5196137,
        "grand_total_execution_time_sec": 4366.1412,
        "avg_prompt_tokens_per_run": 1672714.0,
        "avg_output_tokens_per_run": 59331.666666666664,
        "avg_tokens_per_run": 1732045.6666666667,
        "avg_execution_time_per_run_sec": 1455.3804
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.3409090909090909,
            "std_dev": 0.04545454545454544,
            "variance": 0.0020661157024793376,
            "min": 0.29545454545454547,
            "max": 0.38636363636363635,
            "range": 0.09090909090909088,
            "coefficient_of_variation": 0.1333333333333333
        },
        "correct_cases": {
            "mean": 15,
            "std_dev": 2.0,
            "variance": 4,
            "min": 13,
            "max": 17,
            "range": 4
        },
        "incorrect_cases": {
            "mean": 29,
            "std_dev": 2.0,
            "variance": 4,
            "min": 27,
            "max": 31,
            "range": 4
        },
        "avg_distance_for_correct_cases": {
            "mean": 5.831573655103067,
            "std_dev": 5.05541282507038,
            "variance": 25.55719883188608,
            "min": 2.769230769230769,
            "max": 11.666666666666666,
            "range": 8.897435897435898
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 17.698169433801617,
            "std_dev": 0.9076078838340763,
            "variance": 0.8237520707977702,
            "min": 17.0,
            "max": 18.724137931034484,
            "range": 1.724137931034484
        },
        "overall_avg_distance": {
            "mean": 13.65151515151515,
            "std_dev": 2.358203254499424,
            "variance": 5.561122589531675,
            "min": 11.840909090909092,
            "max": 16.318181818181817,
            "range": 4.477272727272725
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.14550322090980886,
            "std_dev": 0.07179510563794626,
            "variance": 0.0051545371935638636,
            "min": 0.1026717447486843,
            "max": 0.22838956725934897,
            "range": 0.12571782251066466
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.27500764280280077,
            "std_dev": 0.008846043354344675,
            "variance": 7.825248302694559e-05,
            "min": 0.266162660663885,
            "max": 0.28385474699078567,
            "range": 0.01769208632690067
        },
        "normalized_overall_avg_distance": {
            "mean": 0.23103446058265284,
            "std_dev": 0.031027488633084906,
            "variance": 0.0009627050508762131,
            "min": 0.20406849545883138,
            "max": 0.26494616299143225,
            "range": 0.060877667532600876
        },
        "correct_step_number_predictions": {
            "mean": 12.333333333333334,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 12,
            "max": 13,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 31.666666666666668,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 31,
            "max": 32,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.2803030303030303,
            "std_dev": 0.01312159702703697,
            "variance": 0.00017217630853994545,
            "min": 0.2727272727272727,
            "max": 0.29545454545454547,
            "range": 0.022727272727272763
        },
        "step_accuracy_within_+-1": {
            "mean": 0.4015151515151515,
            "std_dev": 0.026243194054073906,
            "variance": 0.0006887052341597801,
            "min": 0.38636363636363635,
            "max": 0.4318181818181818,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-2": {
            "mean": 0.42424242424242425,
            "std_dev": 0.026243194054073875,
            "variance": 0.0006887052341597784,
            "min": 0.4090909090909091,
            "max": 0.45454545454545453,
            "range": 0.045454545454545414
        },
        "step_accuracy_within_+-3": {
            "mean": 0.46969696969696967,
            "std_dev": 0.026243194054073906,
            "variance": 0.0006887052341597801,
            "min": 0.45454545454545453,
            "max": 0.5,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-4": {
            "mean": 0.5303030303030303,
            "std_dev": 0.013121597027036937,
            "variance": 0.0001721763085399446,
            "min": 0.5227272727272727,
            "max": 0.5454545454545454,
            "range": 0.022727272727272707
        },
        "step_accuracy_within_+-5": {
            "mean": 0.5303030303030303,
            "std_dev": 0.013121597027036937,
            "variance": 0.0001721763085399446,
            "min": 0.5227272727272727,
            "max": 0.5454545454545454,
            "range": 0.022727272727272707
        },
        "total_prompt_tokens": {
            "mean": 1672714,
            "std_dev": 0.0,
            "variance": 0,
            "min": 1672714,
            "max": 1672714,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 59331.666666666664,
            "std_dev": 3278.2901539267896,
            "variance": 10747186.333333334,
            "min": 56942,
            "max": 63069,
            "range": 6127
        },
        "total_tokens": {
            "mean": 1732045.6666666667,
            "std_dev": 3278.2901539267896,
            "variance": 10747186.333333334,
            "min": 1729656,
            "max": 1735783,
            "range": 6127
        },
        "total_execution_time_sec": {
            "mean": 1455.3804,
            "std_dev": 282.0617244206133,
            "variance": 79558.81638313,
            "min": 1196.3355,
            "max": 1755.8725,
            "range": 559.537
        }
    }
}