{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 10,
            "Incorrect cases": 34,
            "Average distance for correct cases": 11.5,
            "Average distance for incorrect cases": 33.0,
            "Overall average distance": 28.113636363636363,
            "Normalized average distance for correct cases": 0.21053038613426941,
            "Normalized average distance for incorrect cases": 0.4429427788188379,
            "Normalized overall average distance": 0.39012178048143603,
            "Correct step number predictions": 11,
            "Incorrect step number predictions": 33,
            "Step number accuracy": 0.25,
            "Step accuracy within +-1": 0.29545454545454547,
            "Step accuracy within +-2": 0.3409090909090909,
            "Step accuracy within +-3": 0.38636363636363635,
            "Step accuracy within +-4": 0.4090909090909091,
            "Step accuracy within +-5": 0.4318181818181818,
            "total_prompt_tokens": 1168968,
            "total_output_tokens": 100292,
            "total_tokens": 1269260,
            "total_execution_time_sec": 1609.8478
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 11,
            "Incorrect cases": 33,
            "Average distance for correct cases": 11.636363636363637,
            "Average distance for incorrect cases": 32.54545454545455,
            "Overall average distance": 27.318181818181817,
            "Normalized average distance for correct cases": 0.18240071705531233,
            "Normalized average distance for incorrect cases": 0.44306340441999376,
            "Normalized overall average distance": 0.37789773257882336,
            "Correct step number predictions": 11,
            "Incorrect step number predictions": 33,
            "Step number accuracy": 0.25,
            "Step accuracy within +-1": 0.29545454545454547,
            "Step accuracy within +-2": 0.3181818181818182,
            "Step accuracy within +-3": 0.36363636363636365,
            "Step accuracy within +-4": 0.4318181818181818,
            "Step accuracy within +-5": 0.45454545454545453,
            "total_prompt_tokens": 1168968,
            "total_output_tokens": 107631,
            "total_tokens": 1276599,
            "total_execution_time_sec": 1608.6988
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 12,
            "Incorrect cases": 32,
            "Average distance for correct cases": 24.333333333333332,
            "Average distance for incorrect cases": 31.375,
            "Overall average distance": 29.454545454545453,
            "Normalized average distance for correct cases": 0.3080106108952963,
            "Normalized average distance for incorrect cases": 0.43101323314925366,
            "Normalized overall average distance": 0.39746706344362887,
            "Correct step number predictions": 10,
            "Incorrect step number predictions": 34,
            "Step number accuracy": 0.22727272727272727,
            "Step accuracy within +-1": 0.2727272727272727,
            "Step accuracy within +-2": 0.29545454545454547,
            "Step accuracy within +-3": 0.3409090909090909,
            "Step accuracy within +-4": 0.38636363636363635,
            "Step accuracy within +-5": 0.4090909090909091,
            "total_prompt_tokens": 1168968,
            "total_output_tokens": 106647,
            "total_tokens": 1275615,
            "total_execution_time_sec": 1592.0629
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 33,
        "overall_incorrect_cases": 99,
        "overall_total_cases": 132,
        "overall_accuracy": 0.25,
        "overall_avg_distance_for_correct_cases": 16.21212121212121,
        "overall_avg_distance_for_incorrect_cases": 32.323232323232325,
        "overall_avg_distance": 28.295454545454547,
        "overall_normalized_avg_distance_for_correct_cases": 0.2366011842331723,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.43912697259067074,
        "overall_normalized_avg_distance": 0.3884955255012961,
        "overall_correct_step_number_predictions": 32,
        "overall_incorrect_step_number_predictions": 100,
        "overall_step_number_accuracy": 0.24242424242424243,
        "overall_step_accuracy_within_+-1": 0.2878787878787879,
        "overall_step_accuracy_within_+-2": 0.3181818181818182,
        "overall_step_accuracy_within_+-3": 0.36363636363636365,
        "overall_step_accuracy_within_+-4": 0.4090909090909091,
        "overall_step_accuracy_within_+-5": 0.4318181818181818,
        "grand_total_prompt_tokens": 3506904,
        "grand_total_output_tokens": 314570,
        "grand_total_tokens": 3821474,
        "grand_total_execution_time_sec": 4810.6095,
        "avg_prompt_tokens_per_run": 1168968.0,
        "avg_output_tokens_per_run": 104856.66666666667,
        "avg_tokens_per_run": 1273824.6666666667,
        "avg_execution_time_per_run_sec": 1603.5365
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.25,
            "std_dev": 0.02272727272727272,
            "variance": 0.0005165289256198344,
            "min": 0.22727272727272727,
            "max": 0.2727272727272727,
            "range": 0.04545454545454544,
            "coefficient_of_variation": 0.09090909090909088
        },
        "correct_cases": {
            "mean": 11,
            "std_dev": 1.0,
            "variance": 1,
            "min": 10,
            "max": 12,
            "range": 2
        },
        "incorrect_cases": {
            "mean": 33,
            "std_dev": 1.0,
            "variance": 1,
            "min": 32,
            "max": 34,
            "range": 2
        },
        "avg_distance_for_correct_cases": {
            "mean": 15.823232323232324,
            "std_dev": 7.370279042337694,
            "variance": 54.32101316192224,
            "min": 11.5,
            "max": 24.333333333333332,
            "range": 12.833333333333332
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 32.30681818181818,
            "std_dev": 0.8383714782464814,
            "variance": 0.7028667355371904,
            "min": 31.375,
            "max": 33.0,
            "range": 1.625
        },
        "overall_avg_distance": {
            "mean": 28.295454545454543,
            "std_dev": 1.0797248654745184,
            "variance": 1.1658057851239672,
            "min": 27.318181818181817,
            "max": 29.454545454545453,
            "range": 2.1363636363636367
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.23364723802829268,
            "std_dev": 0.06591853296866267,
            "variance": 0.004345252988740666,
            "min": 0.18240071705531233,
            "max": 0.3080106108952963,
            "range": 0.12560989383998397
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.4390064721293618,
            "std_dev": 0.006922610755975918,
            "variance": 4.792253967875347e-05,
            "min": 0.43101323314925366,
            "max": 0.44306340441999376,
            "range": 0.012050171270740095
        },
        "normalized_overall_avg_distance": {
            "mean": 0.38849552550129607,
            "std_dev": 0.009885504871749332,
            "variance": 9.772320656937978e-05,
            "min": 0.37789773257882336,
            "max": 0.39746706344362887,
            "range": 0.0195693308648055
        },
        "correct_step_number_predictions": {
            "mean": 10.666666666666666,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 10,
            "max": 11,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 33.333333333333336,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 33,
            "max": 34,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.24242424242424243,
            "std_dev": 0.013121597027036953,
            "variance": 0.00017217630853994502,
            "min": 0.22727272727272727,
            "max": 0.25,
            "range": 0.022727272727272735
        },
        "step_accuracy_within_+-1": {
            "mean": 0.2878787878787879,
            "std_dev": 0.01312159702703697,
            "variance": 0.00017217630853994545,
            "min": 0.2727272727272727,
            "max": 0.29545454545454547,
            "range": 0.022727272727272763
        },
        "step_accuracy_within_+-2": {
            "mean": 0.3181818181818182,
            "std_dev": 0.022727272727272707,
            "variance": 0.0005165289256198337,
            "min": 0.29545454545454547,
            "max": 0.3409090909090909,
            "range": 0.045454545454545414
        },
        "step_accuracy_within_+-3": {
            "mean": 0.36363636363636365,
            "std_dev": 0.022727272727272735,
            "variance": 0.000516528925619835,
            "min": 0.3409090909090909,
            "max": 0.38636363636363635,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-4": {
            "mean": 0.4090909090909091,
            "std_dev": 0.022727272727272735,
            "variance": 0.000516528925619835,
            "min": 0.38636363636363635,
            "max": 0.4318181818181818,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-5": {
            "mean": 0.4318181818181818,
            "std_dev": 0.022727272727272707,
            "variance": 0.0005165289256198337,
            "min": 0.4090909090909091,
            "max": 0.45454545454545453,
            "range": 0.045454545454545414
        },
        "total_prompt_tokens": {
            "mean": 1168968,
            "std_dev": 0.0,
            "variance": 0,
            "min": 1168968,
            "max": 1168968,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 104856.66666666667,
            "std_dev": 3983.6164892385577,
            "variance": 15869200.333333334,
            "min": 100292,
            "max": 107631,
            "range": 7339
        },
        "total_tokens": {
            "mean": 1273824.6666666667,
            "std_dev": 3983.6164892385577,
            "variance": 15869200.333333334,
            "min": 1269260,
            "max": 1276599,
            "range": 7339
        },
        "total_execution_time_sec": {
            "mean": 1603.5365,
            "std_dev": 9.953023308020565,
            "variance": 99.06267297000063,
            "min": 1592.0629,
            "max": 1609.8478,
            "range": 17.784900000000107
        }
    }
}