{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 19,
            "Incorrect cases": 25,
            "Average distance for correct cases": 6.105263157894737,
            "Average distance for incorrect cases": 18.64,
            "Overall average distance": 13.227272727272727,
            "Normalized average distance for correct cases": 0.14000566179729929,
            "Normalized average distance for incorrect cases": 0.2881545659540878,
            "Normalized overall average distance": 0.2241811755227473,
            "Correct step number predictions": 12,
            "Incorrect step number predictions": 32,
            "Step number accuracy": 0.2727272727272727,
            "Step accuracy within +-1": 0.4090909090909091,
            "Step accuracy within +-2": 0.4318181818181818,
            "Step accuracy within +-3": 0.4772727272727273,
            "Step accuracy within +-4": 0.5,
            "Step accuracy within +-5": 0.5,
            "total_prompt_tokens": 892370,
            "total_output_tokens": 54216,
            "total_tokens": 946586,
            "total_execution_time_sec": 1212.6035
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 14,
            "Incorrect cases": 30,
            "Average distance for correct cases": 0.8571428571428571,
            "Average distance for incorrect cases": 16.7,
            "Overall average distance": 11.659090909090908,
            "Normalized average distance for correct cases": 0.024239527352900647,
            "Normalized average distance for incorrect cases": 0.2976230124857893,
            "Normalized overall average distance": 0.21063735812532478,
            "Correct step number predictions": 14,
            "Incorrect step number predictions": 30,
            "Step number accuracy": 0.3181818181818182,
            "Step accuracy within +-1": 0.4090909090909091,
            "Step accuracy within +-2": 0.4318181818181818,
            "Step accuracy within +-3": 0.4772727272727273,
            "Step accuracy within +-4": 0.4772727272727273,
            "Step accuracy within +-5": 0.4772727272727273,
            "total_prompt_tokens": 892370,
            "total_output_tokens": 61472,
            "total_tokens": 953842,
            "total_execution_time_sec": 1157.8286
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 17,
            "Incorrect cases": 27,
            "Average distance for correct cases": 4.176470588235294,
            "Average distance for incorrect cases": 22.48148148148148,
            "Overall average distance": 15.409090909090908,
            "Normalized average distance for correct cases": 0.10594586742352889,
            "Normalized average distance for incorrect cases": 0.3283750609384523,
            "Normalized overall average distance": 0.24243650889859555,
            "Correct step number predictions": 13,
            "Incorrect step number predictions": 31,
            "Step number accuracy": 0.29545454545454547,
            "Step accuracy within +-1": 0.4090909090909091,
            "Step accuracy within +-2": 0.4318181818181818,
            "Step accuracy within +-3": 0.4772727272727273,
            "Step accuracy within +-4": 0.4772727272727273,
            "Step accuracy within +-5": 0.4772727272727273,
            "total_prompt_tokens": 892370,
            "total_output_tokens": 61602,
            "total_tokens": 953972,
            "total_execution_time_sec": 961.9765
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 50,
        "overall_incorrect_cases": 82,
        "overall_total_cases": 132,
        "overall_accuracy": 0.3787878787878788,
        "overall_avg_distance_for_correct_cases": 3.98,
        "overall_avg_distance_for_incorrect_cases": 19.195121951219512,
        "overall_avg_distance": 13.431818181818182,
        "overall_normalized_avg_distance_for_correct_cases": 0.09601081406578572,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.30486196547273275,
        "overall_normalized_avg_distance": 0.2257516808488892,
        "overall_correct_step_number_predictions": 39,
        "overall_incorrect_step_number_predictions": 93,
        "overall_step_number_accuracy": 0.29545454545454547,
        "overall_step_accuracy_within_+-1": 0.4090909090909091,
        "overall_step_accuracy_within_+-2": 0.4318181818181818,
        "overall_step_accuracy_within_+-3": 0.4772727272727273,
        "overall_step_accuracy_within_+-4": 0.48484848484848486,
        "overall_step_accuracy_within_+-5": 0.48484848484848486,
        "grand_total_prompt_tokens": 2677110,
        "grand_total_output_tokens": 177290,
        "grand_total_tokens": 2854400,
        "grand_total_execution_time_sec": 3332.4086,
        "avg_prompt_tokens_per_run": 892370.0,
        "avg_output_tokens_per_run": 59096.666666666664,
        "avg_tokens_per_run": 951466.6666666666,
        "avg_execution_time_per_run_sec": 1110.8029
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.3787878787878788,
            "std_dev": 0.057195715418717805,
            "variance": 0.0032713498622589537,
            "min": 0.3181818181818182,
            "max": 0.4318181818181818,
            "range": 0.11363636363636365,
            "coefficient_of_variation": 0.150996688705415
        },
        "correct_cases": {
            "mean": 16.666666666666668,
            "std_dev": 2.516611478423583,
            "variance": 6.333333333333333,
            "min": 14,
            "max": 19,
            "range": 5
        },
        "incorrect_cases": {
            "mean": 27.333333333333332,
            "std_dev": 2.516611478423583,
            "variance": 6.333333333333333,
            "min": 25,
            "max": 30,
            "range": 5
        },
        "avg_distance_for_correct_cases": {
            "mean": 3.7129588677576297,
            "std_dev": 2.654585468402953,
            "variance": 7.046824009056125,
            "min": 0.8571428571428571,
            "max": 6.105263157894737,
            "range": 5.24812030075188
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 19.273827160493827,
            "std_dev": 2.9423943791091425,
            "variance": 8.657684682213077,
            "min": 16.7,
            "max": 22.48148148148148,
            "range": 5.781481481481482
        },
        "overall_avg_distance": {
            "mean": 13.431818181818182,
            "std_dev": 1.883349179581791,
            "variance": 3.5470041322314048,
            "min": 11.659090909090908,
            "max": 15.409090909090908,
            "range": 3.75
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.09006368552457628,
            "std_dev": 0.05949480857564977,
            "variance": 0.003539632247453209,
            "min": 0.024239527352900647,
            "max": 0.14000566179729929,
            "range": 0.11576613444439864
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.30471754645944316,
            "std_dev": 0.021027871110131886,
            "variance": 0.0004421713634243193,
            "min": 0.2881545659540878,
            "max": 0.3283750609384523,
            "range": 0.04022049498436453
        },
        "normalized_overall_avg_distance": {
            "mean": 0.22575168084888922,
            "std_dev": 0.01595764276796175,
            "variance": 0.00025464636270988196,
            "min": 0.21063735812532478,
            "max": 0.24243650889859555,
            "range": 0.03179915077327078
        },
        "correct_step_number_predictions": {
            "mean": 13,
            "std_dev": 1.0,
            "variance": 1,
            "min": 12,
            "max": 14,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 31,
            "std_dev": 1.0,
            "variance": 1,
            "min": 30,
            "max": 32,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.29545454545454547,
            "std_dev": 0.022727272727272735,
            "variance": 0.000516528925619835,
            "min": 0.2727272727272727,
            "max": 0.3181818181818182,
            "range": 0.04545454545454547
        },
        "step_accuracy_within_+-1": {
            "mean": 0.4090909090909091,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.4090909090909091,
            "max": 0.4090909090909091,
            "range": 0.0
        },
        "step_accuracy_within_+-2": {
            "mean": 0.4318181818181818,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.4318181818181818,
            "max": 0.4318181818181818,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 0.4772727272727273,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.4772727272727273,
            "max": 0.4772727272727273,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 0.48484848484848486,
            "std_dev": 0.013121597027036937,
            "variance": 0.0001721763085399446,
            "min": 0.4772727272727273,
            "max": 0.5,
            "range": 0.022727272727272707
        },
        "step_accuracy_within_+-5": {
            "mean": 0.48484848484848486,
            "std_dev": 0.013121597027036937,
            "variance": 0.0001721763085399446,
            "min": 0.4772727272727273,
            "max": 0.5,
            "range": 0.022727272727272707
        },
        "total_prompt_tokens": {
            "mean": 892370,
            "std_dev": 0.0,
            "variance": 0,
            "min": 892370,
            "max": 892370,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 59096.666666666664,
            "std_dev": 4227.281080473988,
            "variance": 17869905.333333332,
            "min": 54216,
            "max": 61602,
            "range": 7386
        },
        "total_tokens": {
            "mean": 951466.6666666666,
            "std_dev": 4227.281080473988,
            "variance": 17869905.333333332,
            "min": 946586,
            "max": 953972,
            "range": 7386
        },
        "total_execution_time_sec": {
            "mean": 1110.8028666666667,
            "std_dev": 131.76508634271573,
            "variance": 17362.037978903332,
            "min": 961.9765,
            "max": 1212.6035,
            "range": 250.62699999999995
        }
    }
}