{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 26,
            "Incorrect cases": 16,
            "Average distance for correct cases": 0.38461538461538464,
            "Average distance for incorrect cases": 0.3125,
            "Overall average distance": 0.35714285714285715,
            "Normalized average distance for correct cases": 0.01126579972733819,
            "Normalized average distance for incorrect cases": 0.013888888888888888,
            "Normalized overall average distance": 0.012265071788881313,
            "Correct step number predictions": 31,
            "Incorrect step number predictions": 11,
            "Step number accuracy": 0.7380952380952381,
            "Step accuracy within +-1": 0.9047619047619048,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 680310,
            "total_output_tokens": 90579,
            "total_tokens": 770889,
            "total_execution_time_sec": 941.1691
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 25,
            "Incorrect cases": 17,
            "Average distance for correct cases": 0.4,
            "Average distance for incorrect cases": 0.11764705882352941,
            "Overall average distance": 0.2857142857142857,
            "Normalized average distance for correct cases": 0.011716431716431717,
            "Normalized average distance for incorrect cases": 0.00326797385620915,
            "Normalized overall average distance": 0.008296817820627345,
            "Correct step number predictions": 33,
            "Incorrect step number predictions": 9,
            "Step number accuracy": 0.7857142857142857,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 680310,
            "total_output_tokens": 92808,
            "total_tokens": 773118,
            "total_execution_time_sec": 927.4126
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 25,
            "Incorrect cases": 17,
            "Average distance for correct cases": 0.32,
            "Average distance for incorrect cases": 0.23529411764705882,
            "Overall average distance": 0.2857142857142857,
            "Normalized average distance for correct cases": 0.009474007474007475,
            "Normalized average distance for incorrect cases": 0.00849673202614379,
            "Normalized overall average distance": 0.009078443602253126,
            "Correct step number predictions": 32,
            "Incorrect step number predictions": 10,
            "Step number accuracy": 0.7619047619047619,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 680310,
            "total_output_tokens": 90843,
            "total_tokens": 771153,
            "total_execution_time_sec": 885.8799
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 76,
        "overall_incorrect_cases": 50,
        "overall_total_cases": 126,
        "overall_accuracy": 0.6031746031746031,
        "overall_avg_distance_for_correct_cases": 0.3684210526315789,
        "overall_avg_distance_for_incorrect_cases": 0.22,
        "overall_avg_distance": 0.30952380952380953,
        "overall_normalized_avg_distance_for_correct_cases": 0.010824628587786484,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.008444444444444444,
        "overall_normalized_avg_distance": 0.009880111070587262,
        "overall_correct_step_number_predictions": 96,
        "overall_incorrect_step_number_predictions": 30,
        "overall_step_number_accuracy": 0.7619047619047619,
        "overall_step_accuracy_within_+-1": 0.9285714285714286,
        "overall_step_accuracy_within_+-2": 1.0,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 2040930,
        "grand_total_output_tokens": 274230,
        "grand_total_tokens": 2315160,
        "grand_total_execution_time_sec": 2754.4616,
        "avg_prompt_tokens_per_run": 680310.0,
        "avg_output_tokens_per_run": 91410.0,
        "avg_tokens_per_run": 771720.0,
        "avg_execution_time_per_run_sec": 918.1539
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.6031746031746031,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.5952380952380952,
            "max": 0.6190476190476191,
            "range": 0.023809523809523836,
            "coefficient_of_variation": 0.022790142204853675
        },
        "correct_cases": {
            "mean": 25.333333333333332,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 25,
            "max": 26,
            "range": 1
        },
        "incorrect_cases": {
            "mean": 16.666666666666668,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 16,
            "max": 17,
            "range": 1
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.36820512820512824,
            "std_dev": 0.04244964530579463,
            "variance": 0.001801972386587772,
            "min": 0.32,
            "max": 0.4,
            "range": 0.08000000000000002
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.22181372549019607,
            "std_dev": 0.09812343196610777,
            "variance": 0.009628207900807382,
            "min": 0.11764705882352941,
            "max": 0.3125,
            "range": 0.1948529411764706
        },
        "overall_avg_distance": {
            "mean": 0.30952380952380953,
            "std_dev": 0.04123930494211614,
            "variance": 0.0017006802721088448,
            "min": 0.2857142857142857,
            "max": 0.35714285714285715,
            "range": 0.07142857142857145
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.010818746305925794,
            "std_dev": 0.0011861741828038497,
            "variance": 1.4070091919503805e-06,
            "min": 0.009474007474007475,
            "max": 0.011716431716431717,
            "range": 0.002242424242424242
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.00855119825708061,
            "std_dev": 0.005310666997711771,
            "variance": 2.820318396058496e-05,
            "min": 0.00326797385620915,
            "max": 0.013888888888888888,
            "range": 0.010620915032679739
        },
        "normalized_overall_avg_distance": {
            "mean": 0.009880111070587262,
            "std_dev": 0.0021020853780186447,
            "variance": 4.418762936479788e-06,
            "min": 0.008296817820627345,
            "max": 0.012265071788881313,
            "range": 0.003968253968253968
        },
        "correct_step_number_predictions": {
            "mean": 32,
            "std_dev": 1.0,
            "variance": 1,
            "min": 31,
            "max": 33,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 10,
            "std_dev": 1.0,
            "variance": 1,
            "min": 9,
            "max": 11,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.7619047619047619,
            "std_dev": 0.02380952380952378,
            "variance": 0.0005668934240362798,
            "min": 0.7380952380952381,
            "max": 0.7857142857142857,
            "range": 0.04761904761904756
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9285714285714286,
            "std_dev": 0.02380952380952378,
            "variance": 0.0005668934240362798,
            "min": 0.9047619047619048,
            "max": 0.9523809523809523,
            "range": 0.04761904761904756
        },
        "step_accuracy_within_+-2": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 680310,
            "std_dev": 0.0,
            "variance": 0,
            "min": 680310,
            "max": 680310,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 91410,
            "std_dev": 1217.8780727149988,
            "variance": 1483227,
            "min": 90579,
            "max": 92808,
            "range": 2229
        },
        "total_tokens": {
            "mean": 771720,
            "std_dev": 1217.8780727149988,
            "variance": 1483227,
            "min": 770889,
            "max": 773118,
            "range": 2229
        },
        "total_execution_time_sec": {
            "mean": 918.1538666666667,
            "std_dev": 28.78397151824834,
            "variance": 828.5170163633317,
            "min": 885.8799,
            "max": 941.1691,
            "range": 55.28919999999994
        }
    }
}