{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 23,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.21739130434782608,
            "Average distance for incorrect cases": 0.21052631578947367,
            "Overall average distance": 0.21428571428571427,
            "Normalized average distance for correct cases": 0.0054509902335989294,
            "Normalized average distance for incorrect cases": 0.007351712614870509,
            "Normalized overall average distance": 0.006310840834650359,
            "Correct step number predictions": 34,
            "Incorrect step number predictions": 8,
            "Step number accuracy": 0.8095238095238095,
            "Step accuracy within +-1": 0.9761904761904762,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 373797,
            "total_output_tokens": 115087,
            "total_tokens": 488884,
            "total_execution_time_sec": 1362.3636
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 25,
            "Incorrect cases": 17,
            "Average distance for correct cases": 0.24,
            "Average distance for incorrect cases": 0.11764705882352941,
            "Overall average distance": 0.19047619047619047,
            "Normalized average distance for correct cases": 0.007046657046657047,
            "Normalized average distance for incorrect cases": 0.00326797385620915,
            "Normalized overall average distance": 0.005517190040999565,
            "Correct step number predictions": 35,
            "Incorrect step number predictions": 7,
            "Step number accuracy": 0.8333333333333334,
            "Step accuracy within +-1": 0.9761904761904762,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 373797,
            "total_output_tokens": 111765,
            "total_tokens": 485562,
            "total_execution_time_sec": 1363.3965
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 25,
            "Incorrect cases": 17,
            "Average distance for correct cases": 0.24,
            "Average distance for incorrect cases": 0.0,
            "Overall average distance": 0.14285714285714285,
            "Normalized average distance for correct cases": 0.007046657046657047,
            "Normalized average distance for incorrect cases": 0.0,
            "Normalized overall average distance": 0.004194438718248242,
            "Correct step number predictions": 36,
            "Incorrect step number predictions": 6,
            "Step number accuracy": 0.8571428571428571,
            "Step accuracy within +-1": 1.0,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 373797,
            "total_output_tokens": 112072,
            "total_tokens": 485869,
            "total_execution_time_sec": 1305.1267
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 73,
        "overall_incorrect_cases": 53,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5793650793650794,
        "overall_avg_distance_for_correct_cases": 0.2328767123287671,
        "overall_avg_distance_for_incorrect_cases": 0.11320754716981132,
        "overall_avg_distance": 0.18253968253968253,
        "overall_normalized_avg_distance_for_correct_cases": 0.00654391270829627,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.003683737646001797,
        "overall_normalized_avg_distance": 0.005340823197966056,
        "overall_correct_step_number_predictions": 105,
        "overall_incorrect_step_number_predictions": 21,
        "overall_step_number_accuracy": 0.8333333333333334,
        "overall_step_accuracy_within_+-1": 0.9841269841269841,
        "overall_step_accuracy_within_+-2": 1.0,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1121391,
        "grand_total_output_tokens": 338924,
        "grand_total_tokens": 1460315,
        "grand_total_execution_time_sec": 4030.8868,
        "avg_prompt_tokens_per_run": 373797.0,
        "avg_output_tokens_per_run": 112974.66666666667,
        "avg_tokens_per_run": 486771.6666666667,
        "avg_execution_time_per_run_sec": 1343.6289
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5793650793650794,
            "std_dev": 0.027492869961410718,
            "variance": 0.0007558578987150397,
            "min": 0.5476190476190477,
            "max": 0.5952380952380952,
            "range": 0.04761904761904756,
            "coefficient_of_variation": 0.04745344678270891
        },
        "correct_cases": {
            "mean": 24.333333333333332,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 23,
            "max": 25,
            "range": 2
        },
        "incorrect_cases": {
            "mean": 17.666666666666668,
            "std_dev": 1.1547005383792515,
            "variance": 1.3333333333333333,
            "min": 17,
            "max": 19,
            "range": 2
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.23246376811594202,
            "std_dev": 0.013053136520808929,
            "variance": 0.00017038437303087582,
            "min": 0.21739130434782608,
            "max": 0.24,
            "range": 0.02260869565217391
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.10939112487100103,
            "std_dev": 0.10550570005475382,
            "variance": 0.01113145274404368,
            "min": 0.0,
            "max": 0.21052631578947367,
            "range": 0.21052631578947367
        },
        "overall_avg_distance": {
            "mean": 0.18253968253968253,
            "std_dev": 0.036369648372665396,
            "variance": 0.0013227513227513227,
            "min": 0.14285714285714285,
            "max": 0.21428571428571427,
            "range": 0.07142857142857142
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.006514768108971007,
            "std_dev": 0.0009212586640560562,
            "variance": 8.487175260983495e-07,
            "min": 0.0054509902335989294,
            "max": 0.007046657046657047,
            "range": 0.0015956668130581173
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.0035398954903598863,
            "std_dev": 0.0036833918640637397,
            "variance": 1.3567375624250951e-05,
            "min": 0.0,
            "max": 0.007351712614870509,
            "range": 0.007351712614870509
        },
        "normalized_overall_avg_distance": {
            "mean": 0.005340823197966056,
            "std_dev": 0.001069167165165974,
            "variance": 1.143118427069045e-06,
            "min": 0.004194438718248242,
            "max": 0.006310840834650359,
            "range": 0.002116402116402117
        },
        "correct_step_number_predictions": {
            "mean": 35,
            "std_dev": 1.0,
            "variance": 1,
            "min": 34,
            "max": 36,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 7,
            "std_dev": 1.0,
            "variance": 1,
            "min": 6,
            "max": 8,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.8333333333333334,
            "std_dev": 0.02380952380952378,
            "variance": 0.0005668934240362798,
            "min": 0.8095238095238095,
            "max": 0.8571428571428571,
            "range": 0.04761904761904756
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9841269841269841,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.9761904761904762,
            "max": 1.0,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-2": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 373797,
            "std_dev": 0.0,
            "variance": 0,
            "min": 373797,
            "max": 373797,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 112974.66666666667,
            "std_dev": 1835.7631473949284,
            "variance": 3370026.3333333335,
            "min": 111765,
            "max": 115087,
            "range": 3322
        },
        "total_tokens": {
            "mean": 486771.6666666667,
            "std_dev": 1835.7631473949284,
            "variance": 3370026.3333333335,
            "min": 485562,
            "max": 488884,
            "range": 3322
        },
        "total_execution_time_sec": {
            "mean": 1343.6289333333334,
            "std_dev": 33.347911468986055,
            "variance": 1112.0831993433317,
            "min": 1305.1267,
            "max": 1363.3965,
            "range": 58.26980000000003
        }
    }
}