{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 22,
            "Incorrect cases": 20,
            "Average distance for correct cases": 0.4090909090909091,
            "Average distance for incorrect cases": 0.2,
            "Overall average distance": 0.30952380952380953,
            "Normalized average distance for correct cases": 0.012015425651789287,
            "Normalized average distance for incorrect cases": 0.009761904761904762,
            "Normalized overall average distance": 0.010942320466129989,
            "Correct step number predictions": 32,
            "Incorrect step number predictions": 10,
            "Step number accuracy": 0.7619047619047619,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 651288,
            "total_output_tokens": 91752,
            "total_tokens": 743040,
            "total_execution_time_sec": 854.2543
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 23,
            "Incorrect cases": 19,
            "Average distance for correct cases": 0.34782608695652173,
            "Average distance for incorrect cases": 0.2631578947368421,
            "Overall average distance": 0.30952380952380953,
            "Normalized average distance for correct cases": 0.01029783421087769,
            "Normalized average distance for incorrect cases": 0.013450292397660818,
            "Normalized overall average distance": 0.011723946247755771,
            "Correct step number predictions": 32,
            "Incorrect step number predictions": 10,
            "Step number accuracy": 0.7619047619047619,
            "Step accuracy within +-1": 0.9523809523809523,
            "Step accuracy within +-2": 0.9761904761904762,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 651288,
            "total_output_tokens": 94338,
            "total_tokens": 745626,
            "total_execution_time_sec": 849.8562
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 27,
            "Incorrect cases": 15,
            "Average distance for correct cases": 0.4074074074074074,
            "Average distance for incorrect cases": 0.13333333333333333,
            "Overall average distance": 0.30952380952380953,
            "Normalized average distance for correct cases": 0.012220289998067776,
            "Normalized average distance for incorrect cases": 0.0037037037037037034,
            "Normalized overall average distance": 0.009178652035794892,
            "Correct step number predictions": 32,
            "Incorrect step number predictions": 10,
            "Step number accuracy": 0.7619047619047619,
            "Step accuracy within +-1": 0.9285714285714286,
            "Step accuracy within +-2": 1.0,
            "Step accuracy within +-3": 1.0,
            "Step accuracy within +-4": 1.0,
            "Step accuracy within +-5": 1.0,
            "total_prompt_tokens": 651288,
            "total_output_tokens": 92546,
            "total_tokens": 743834,
            "total_execution_time_sec": 886.0327
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 72,
        "overall_incorrect_cases": 54,
        "overall_total_cases": 126,
        "overall_accuracy": 0.5714285714285714,
        "overall_avg_distance_for_correct_cases": 0.3888888888888889,
        "overall_avg_distance_for_incorrect_cases": 0.2037037037037037,
        "overall_avg_distance": 0.30952380952380953,
        "overall_normalized_avg_distance_for_correct_cases": 0.011543574738019183,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.009376837154614933,
        "overall_normalized_avg_distance": 0.010614972916560219,
        "overall_correct_step_number_predictions": 96,
        "overall_incorrect_step_number_predictions": 30,
        "overall_step_number_accuracy": 0.7619047619047619,
        "overall_step_accuracy_within_+-1": 0.9365079365079365,
        "overall_step_accuracy_within_+-2": 0.9920634920634921,
        "overall_step_accuracy_within_+-3": 1.0,
        "overall_step_accuracy_within_+-4": 1.0,
        "overall_step_accuracy_within_+-5": 1.0,
        "grand_total_prompt_tokens": 1953864,
        "grand_total_output_tokens": 278636,
        "grand_total_tokens": 2232500,
        "grand_total_execution_time_sec": 2590.1432,
        "avg_prompt_tokens_per_run": 651288.0,
        "avg_output_tokens_per_run": 92878.66666666667,
        "avg_tokens_per_run": 744166.6666666666,
        "avg_execution_time_per_run_sec": 863.3811
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.5714285714285715,
            "std_dev": 0.06299407883487121,
            "variance": 0.003968253968253969,
            "min": 0.5238095238095238,
            "max": 0.6428571428571429,
            "range": 0.11904761904761907,
            "coefficient_of_variation": 0.1102396379610246
        },
        "correct_cases": {
            "mean": 24,
            "std_dev": 2.6457513110645907,
            "variance": 7,
            "min": 22,
            "max": 27,
            "range": 5
        },
        "incorrect_cases": {
            "mean": 18,
            "std_dev": 2.6457513110645907,
            "variance": 7,
            "min": 15,
            "max": 20,
            "range": 5
        },
        "avg_distance_for_correct_cases": {
            "mean": 0.3881081344849461,
            "std_dev": 0.034895430349707206,
            "variance": 0.001217691059291267,
            "min": 0.34782608695652173,
            "max": 0.4090909090909091,
            "range": 0.06126482213438739
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 0.19883040935672514,
            "std_dev": 0.06492018286024756,
            "variance": 0.0042146301426079814,
            "min": 0.13333333333333333,
            "max": 0.2631578947368421,
            "range": 0.12982456140350876
        },
        "overall_avg_distance": {
            "mean": 0.30952380952380953,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.30952380952380953,
            "max": 0.30952380952380953,
            "range": 0.0
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.011511183286911584,
            "std_dev": 0.0010557719144437669,
            "variance": 1.1146543353282565e-06,
            "min": 0.01029783421087769,
            "max": 0.012220289998067776,
            "range": 0.0019224557871900857
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.008971966954423094,
            "std_dev": 0.0049210770261254675,
            "variance": 2.421699909705987e-05,
            "min": 0.0037037037037037034,
            "max": 0.013450292397660818,
            "range": 0.009746588693957114
        },
        "normalized_overall_avg_distance": {
            "mean": 0.010614972916560217,
            "std_dev": 0.001303839702577502,
            "variance": 1.699997970017389e-06,
            "min": 0.009178652035794892,
            "max": 0.011723946247755771,
            "range": 0.0025452942119608794
        },
        "correct_step_number_predictions": {
            "mean": 32,
            "std_dev": 0.0,
            "variance": 0,
            "min": 32,
            "max": 32,
            "range": 0
        },
        "incorrect_step_number_predictions": {
            "mean": 10,
            "std_dev": 0.0,
            "variance": 0,
            "min": 10,
            "max": 10,
            "range": 0
        },
        "step_number_accuracy": {
            "mean": 0.7619047619047619,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 0.7619047619047619,
            "max": 0.7619047619047619,
            "range": 0.0
        },
        "step_accuracy_within_+-1": {
            "mean": 0.9365079365079365,
            "std_dev": 0.013746434980705326,
            "variance": 0.00018896447467875904,
            "min": 0.9285714285714286,
            "max": 0.9523809523809523,
            "range": 0.023809523809523725
        },
        "step_accuracy_within_+-2": {
            "mean": 0.9920634920634921,
            "std_dev": 0.01374643498070539,
            "variance": 0.0001889644746787608,
            "min": 0.9761904761904762,
            "max": 1.0,
            "range": 0.023809523809523836
        },
        "step_accuracy_within_+-3": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-4": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "step_accuracy_within_+-5": {
            "mean": 1.0,
            "std_dev": 0.0,
            "variance": 0.0,
            "min": 1.0,
            "max": 1.0,
            "range": 0.0
        },
        "total_prompt_tokens": {
            "mean": 651288,
            "std_dev": 0.0,
            "variance": 0,
            "min": 651288,
            "max": 651288,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 92878.66666666667,
            "std_dev": 1324.7072632598242,
            "variance": 1754849.3333333333,
            "min": 91752,
            "max": 94338,
            "range": 2586
        },
        "total_tokens": {
            "mean": 744166.6666666666,
            "std_dev": 1324.7072632598242,
            "variance": 1754849.3333333333,
            "min": 743040,
            "max": 745626,
            "range": 2586
        },
        "total_execution_time_sec": {
            "mean": 863.3810666666666,
            "std_dev": 19.73976166024641,
            "variance": 389.65819040333406,
            "min": 849.8562,
            "max": 886.0327,
            "range": 36.17650000000003
        }
    }
}