{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 21,
            "Incorrect cases": 21,
            "Average distance for correct cases": 0.19047619047619047,
            "Average distance for incorrect cases": 0.23809523809523808,
            "Overall average distance": 0.21428571428571427,
            "Normalized average distance for correct cases": 0.005132081322557513,
            "Normalized average distance for incorrect cases": 0.007455507455507456,
            "Normalized overall average distance": 0.006293794389032484,
            "Correct step number predictions": 35,
            "Incorrect step number predictions": 7,
            "Step number accuracy": 0.8333333333333334,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 344775,
            "total_output_tokens": 108298,
            "total_tokens": 453073,
            "total_execution_time_sec": 1015.5756
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 24,
            "Incorrect cases": 18,
            "Average distance for correct cases": 0.25,
            "Average distance for incorrect cases": 0.2777777777777778,
            "Overall average distance": 0.2619047619047619,
            "Normalized average distance for correct cases": 0.00660697327363994,
            "Normalized average distance for incorrect cases": 0.008698092031425365,
            "Normalized overall average distance": 0.007503167026976552,
            "Correct step number predictions": 34,
            "Incorrect step number predictions": 8,
            "Step number accuracy": 0.8095238095238095,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 344775,
            "total_output_tokens": 101567,
            "total_tokens": 446342,
            "total_execution_time_sec": 914.5752
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 23,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.2608695652173913,
            "Average distance for incorrect cases": 0.2631578947368421,
            "Overall average distance": 0.2619047619047619,
            "Normalized average distance for correct cases": 0.007659409833322877,
            "Normalized average distance for incorrect cases": 0.008073948311639313,
            "Normalized overall average distance": 0.007846939144942217,
            "Correct step number predictions": 33,
            "Incorrect step number predictions": 9,
            "Step number accuracy": 0.7857142857142857,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 344775,
            "total_output_tokens": 98855,
            "total_tokens": 443630,
            "total_execution_time_sec": 918.4754
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 68,
        "overall_incorrect_cases": 58,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5396825396825397,
        "overall_avg_distance_for_correct_cases": 0.23529411764705882,
        "overall_avg_distance_for_incorrect_cases": 0.25862068965517243,
        "overall_avg_distance": 0.24603174603174602,
        "overall_normalized_avg_distance_for_correct_cases": 0.006507463125110184,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.00804372984573207,
        "overall_normalized_avg_distance": 0.007214633520317085,
        "overall_correct_step_number_predictions": 102,
        "overall_incorrect_step_number_predictions": 24,
        "overall_step_number_accuracy": 0.8095238095238095,
        "overall_step_accuracy_within_+-1": 0.9444444444444444,
        "overall_step_accuracy_within_+-2": 1.0,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1034325,
        "grand_total_output_tokens": 308720,
        "grand_total_tokens": 1343045,
        "grand_total_execution_time_sec": 2848.6262,
        "avg_prompt_tokens_per_run": 344775.0,
        "avg_output_tokens_per_run": 102906.66666666667,
        "avg_tokens_per_run": 447681.6666666667,
        "avg_execution_time_per_run_sec": 949.5421
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5396825396825397,
            "std_dev": 0.03636964837266539,
            "variance": 0.0013227513227513222,
            "min": 0.5,
            "max": 0.5714285714285714,
            "range": 0.0714285714285714,
            "coefficient_of_variation": 0.06739081904346822
        },
        "correct_cases": {
            "mean": 22.666666666666668,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 21,
            "max": 24,
            "range": 3
        },
        "incorrect_cases": {
            "mean": 19.333333333333332,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 18,
            "max": 21,
            "range": 3
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.23378191856452726,
            "std_dev": 0.03789559903141131,
            "variance": 0.0014360764259495021,
            "min": 0.19047619047619047,
            "max": 0.2608695652173913,
            "range": 0.07039337474120083
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.259676970203286,
            "std_dev": 0.02006897146393704,
            "variance": 0.00040276361562031925,
            "min": 0.23809523809523808,
            "max": 0.2777777777777778,
            "range": 0.03968253968253971
        },
        "overall_avg_distance": {
            "mean": 0.24603174603174605,
            "std_dev": 0.027492869961410767,
            "variance": 0.0007558578987150424,
            "min": 0.21428571428571427,
            "max": 0.2619047619047619,
            "range": 0.047619047619047644
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.00646615480984011,
            "std_dev": 0.0012695352417881418,
            "variance": 1.611719730142076e-06,
            "min": 0.005132081322557513,
            "max": 0.007659409833322877,
            "range": 0.0025273285107653637
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.008075849266190711,
            "std_dev": 0.0006212944690711705,
            "variance": 3.8600681729842754e-07,
            "min": 0.007455507455507456,
            "max": 0.008698092031425365,
            "range": 0.0012425845759179094
        },
        "normalized_overall_avg_distance": {
            "mean": 0.007214633520317084,
            "std_dev": 0.0008157838847092914,
            "variance": 6.655033465513824e-07,
            "min": 0.006293794389032484,
            "max": 0.007846939144942217,
            "range": 0.0015531447559097326
        },
        "correct_step_number_predictions": {
            "mean": 34,
            "std_dev": 1.0,
            "variance": 1,
            "min": 33,
            "max": 35,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 8,
            "std_dev": 1.0,
            "variance": 1,
            "min": 7,
            "max": 9,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.8095238095238095,
            "std_dev": 0.023809523809523836,
            "variance": 0.0005668934240362824,
            "min": 0.7857142857142857,
            "max": 0.8333333333333334,
            "range": 0.04761904761904767
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9444444444444444,
            "std_dev": 0.013746434980705326,
            "variance": 0.00018896447467875904,
            "min": 0.9285714285714286,
            "max": 0.9523809523809523,
            "range": 0.023809523809523725
        },
        "step_accuracy_within_+-2": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 344775,
            "std_dev": 0.0,
            "variance": 0,
            "min": 344775,
            "max": 344775,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 102906.66666666667,
            "std_dev": 4861.953551128737,
            "variance": 23638592.333333332,
            "min": 98855,
            "max": 108298,
            "range": 9443
        },
        "total_tokens": {
            "mean": 447681.6666666667,
            "std_dev": 4861.953551128737,
            "variance": 23638592.333333332,
            "min": 443630,
            "max": 453073,
            "range": 9443
        },
        "total_execution_time_sec": {
            "mean": 949.5420666666666,
            "std_dev": 57.219957474410386,
            "variance": 3274.123533373333,
            "min": 914.5752,
            "max": 1015.5756,
            "range": 101.00040000000001
        }
    }
}