{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 23,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.34782608695652173,
            "Average distance for incorrect cases": 0.21052631578947367,
            "Overall average distance": 0.2857142857142857,
            "Normalized average distance for correct cases": 0.009146570016135233,
            "Normalized average distance for incorrect cases": 0.007602339181286549,
            "Normalized overall average distance": 0.008447989400370353,
            "Correct step number predictions": 33,
            "Incorrect step number predictions": 9,
            "Step number accuracy": 0.7857142857142857,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 634392,
            "total_output_tokens": 89896,
            "total_tokens": 724288,
            "total_execution_time_sec": 922.2983
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 26,
            "Incorrect cases": 16,
            "Average distance for correct cases": 0.34615384615384615,
            "Average distance for incorrect cases": 0.0625,
            "Overall average distance": 0.23809523809523808,
            "Normalized average distance for correct cases": 0.009373247834786295,
            "Normalized average distance for incorrect cases": 0.003472222222222222,
            "Normalized overall average distance": 0.00712523807761903,
            "Correct step number predictions": 34,
            "Incorrect step number predictions": 8,
            "Step number accuracy": 0.8095238095238095,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 634392,
            "total_output_tokens": 87986,
            "total_tokens": 722378,
            "total_execution_time_sec": 874.2236
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 24,
            "Incorrect cases": 18,
            "Average distance for correct cases": 0.4166666666666667,
            "Average distance for incorrect cases": 0.05555555555555555,
            "Overall average distance": 0.2619047619047619,
            "Normalized average distance for correct cases": 0.011080277746944414,
            "Normalized average distance for incorrect cases": 0.0030864197530864196,
            "Normalized overall average distance": 0.007654338606719559,
            "Correct step number predictions": 34,
            "Incorrect step number predictions": 8,
            "Step number accuracy": 0.8095238095238095,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 634392,
            "total_output_tokens": 98284,
            "total_tokens": 732676,
            "total_execution_time_sec": 997.6733
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 73,
        "overall_incorrect_cases": 53,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5793650793650794,
        "overall_avg_distance_for_correct_cases": 0.3698630136986301,
        "overall_avg_distance_for_incorrect_cases": 0.11320754716981132,
        "overall_avg_distance": 0.2619047619047619,
        "overall_normalized_avg_distance_for_correct_cases": 0.00986304410961945,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.004821802935010481,
        "overall_normalized_avg_distance": 0.007742522028236313,
        "overall_correct_step_number_predictions": 101,
        "overall_incorrect_step_number_predictions": 25,
        "overall_step_number_accuracy": 0.8015873015873016,
        "overall_step_accuracy_within_+-1": 0.9365079365079365,
        "overall_step_accuracy_within_+-2": 1.0,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1903176,
        "grand_total_output_tokens": 276166,
        "grand_total_tokens": 2179342,
        "grand_total_execution_time_sec": 2794.1952,
        "avg_prompt_tokens_per_run": 634392.0,
        "avg_output_tokens_per_run": 92055.33333333333,
        "avg_tokens_per_run": 726447.3333333334,
        "avg_execution_time_per_run_sec": 931.3984
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5793650793650794,
            "std_dev": 0.03636964837266539,
            "variance": 0.0013227513227513222,
            "min": 0.5476190476190477,
            "max": 0.6190476190476191,
            "range": 0.0714285714285714,
            "coefficient_of_variation": 0.062775009519943
        },
        "correct_cases": {
            "mean": 24.333333333333332,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 23,
            "max": 26,
            "range": 3
        },
        "incorrect_cases": {
            "mean": 17.666666666666668,
            "std_dev": 1.5275252316519468,
            "variance": 2.3333333333333335,
            "min": 16,
            "max": 19,
            "range": 3
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.3702155332590115,
            "std_dev": 0.04023654984547466,
            "variance": 0.0016189799434673665,
            "min": 0.34615384615384615,
            "max": 0.4166666666666667,
            "range": 0.07051282051282054
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.10952729044834307,
            "std_dev": 0.08753661329440526,
            "variance": 0.00766265866705425,
            "min": 0.05555555555555555,
            "max": 0.21052631578947367,
            "range": 0.15497076023391812
        },
        "overall_avg_distance": {
            "mean": 0.2619047619047619,
            "std_dev": 0.023809523809523808,
            "variance": 0.0005668934240362811,
            "min": 0.23809523809523808,
            "max": 0.2857142857142857,
            "range": 0.047619047619047616
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.009866698532621981,
            "std_dev": 0.0010570840034943728,
            "variance": 1.1174265904436913e-06,
            "min": 0.009146570016135233,
            "max": 0.011080277746944414,
            "range": 0.0019337077308091803
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.0047203270521983965,
            "std_dev": 0.002503339034261288,
            "variance": 6.266706320456238e-06,
            "min": 0.0030864197530864196,
            "max": 0.007602339181286549,
            "range": 0.004515919428200129
        },
        "normalized_overall_avg_distance": {
            "mean": 0.007742522028236314,
            "std_dev": 0.0006657702323871913,
            "variance": 4.432500023328947e-07,
            "min": 0.00712523807761903,
            "max": 0.008447989400370353,
            "range": 0.0013227513227513227
        },
        "correct_step_number_predictions": {
            "mean": 33.666666666666664,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 33,
            "max": 34,
            "range": 1
        },
        "incorrect_step_number_predictions": {
            "mean": 8.333333333333334,
            "std_dev": 0.5773502691896257,
            "variance": 0.3333333333333333,
            "min": 8,
            "max": 9,
            "range": 1
        },
        "step_number_accuracy": {
            "mean": 0.8015873015873016,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.7857142857142857,
            "max": 0.8095238095238095,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9365079365079365,
            "std_dev": 0.013746434980705326,
            "variance": 0.00018896447467875904,
            "min": 0.9285714285714286,
            "max": 0.9523809523809523,
            "range": 0.023809523809523725
        },
        "step_accuracy_within_+-2": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 634392,
            "std_dev": 0.0,
            "variance": 0,
            "min": 634392,
            "max": 634392,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 92055.33333333333,
            "std_dev": 5478.069124548661,
            "variance": 30009241.333333332,
            "min": 87986,
            "max": 98284,
            "range": 10298
        },
        "total_tokens": {
            "mean": 726447.3333333334,
            "std_dev": 5478.069124548661,
            "variance": 30009241.333333332,
            "min": 722378,
            "max": 732676,
            "range": 10298
        },
        "total_execution_time_sec": {
            "mean": 931.3984,
            "std_dev": 62.22592685151424,
            "variance": 3872.06597253,
            "min": 874.2236,
            "max": 997.6733,
            "range": 123.4497
        }
    }
}