{
    "num_runs": 3,
    "individual_run_summaries": [
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 16,
            "Incorrect cases": 28,
            "Average distance for correct cases": 7.9375,
            "Average distance for incorrect cases": 34.17857142857143,
            "Overall average distance": 24.636363636363637,
            "Normalized average distance for correct cases": 0.12054486124381786,
            "Normalized average distance for incorrect cases": 0.44800097485540846,
            "Normalized overall average distance": 0.3289260244511937,
            "Correct step number predictions": 14,
            "Incorrect step number predictions": 30,
            "Step number accuracy": 0.3181818181818182,
            "Step accuracy within +-1": 0.4090909090909091,
            "Step accuracy within +-2": 0.4318181818181818,
            "Step accuracy within +-3": 0.4772727272727273,
            "Step accuracy within +-4": 0.5,
            "Step accuracy within +-5": 0.5,
            "total_prompt_tokens": 924094,
            "total_output_tokens": 97088,
            "total_tokens": 1021182,
            "total_execution_time_sec": 969.6259
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 19,
            "Incorrect cases": 25,
            "Average distance for correct cases": 5.2631578947368425,
            "Average distance for incorrect cases": 40.0,
            "Overall average distance": 25.0,
            "Normalized average distance for correct cases": 0.12605507255687132,
            "Normalized average distance for incorrect cases": 0.48718576815665876,
            "Normalized overall average distance": 0.33124296778402323,
            "Correct step number predictions": 15,
            "Incorrect step number predictions": 29,
            "Step number accuracy": 0.3409090909090909,
            "Step accuracy within +-1": 0.45454545454545453,
            "Step accuracy within +-2": 0.5,
            "Step accuracy within +-3": 0.5227272727272727,
            "Step accuracy within +-4": 0.5227272727272727,
            "Step accuracy within +-5": 0.5227272727272727,
            "total_prompt_tokens": 924094,
            "total_output_tokens": 95674,
            "total_tokens": 1019768,
            "total_execution_time_sec": 892.0094
        },
        {
            "model_name": "gpt-5",
            "api_version": "2024-12-01-preview",
            "Correct cases": 14,
            "Incorrect cases": 30,
            "Average distance for correct cases": 9.928571428571429,
            "Average distance for incorrect cases": 34.9,
            "Overall average distance": 26.954545454545453,
            "Normalized average distance for correct cases": 0.15923962266433273,
            "Normalized average distance for incorrect cases": 0.46307496289845346,
            "Normalized overall average distance": 0.36640008191486956,
            "Correct step number predictions": 13,
            "Incorrect step number predictions": 31,
            "Step number accuracy": 0.29545454545454547,
            "Step accuracy within +-1": 0.36363636363636365,
            "Step accuracy within +-2": 0.38636363636363635,
            "Step accuracy within +-3": 0.4318181818181818,
            "Step accuracy within +-4": 0.45454545454545453,
            "Step accuracy within +-5": 0.5,
            "total_prompt_tokens": 924094,
            "total_output_tokens": 100500,
            "total_tokens": 1024594,
            "total_execution_time_sec": 996.4381
        }
    ],
    "aggregate_statistics": {
        "overall_correct_cases": 49,
        "overall_incorrect_cases": 83,
        "overall_total_cases": 132,
        "overall_accuracy": 0.3712121212121212,
        "overall_avg_distance_for_correct_cases": 7.469387755102041,
        "overall_avg_distance_for_incorrect_cases": 36.19277108433735,
        "overall_avg_distance": 25.53030303030303,
        "overall_normalized_avg_distance_for_correct_cases": 0.13373711991392448,
        "overall_normalized_avg_distance_for_incorrect_cases": 0.46525205285327115,
        "overall_normalized_avg_distance": 0.34218969138336214,
        "overall_correct_step_number_predictions": 42,
        "overall_incorrect_step_number_predictions": 90,
        "overall_step_number_accuracy": 0.3181818181818182,
        "overall_step_accuracy_within_+-1": 0.4090909090909091,
        "overall_step_accuracy_within_+-2": 0.4393939393939394,
        "overall_step_accuracy_within_+-3": 0.4772727272727273,
        "overall_step_accuracy_within_+-4": 0.49242424242424243,
        "overall_step_accuracy_within_+-5": 0.5075757575757576,
        "grand_total_prompt_tokens": 2772282,
        "grand_total_output_tokens": 293262,
        "grand_total_tokens": 3065544,
        "grand_total_execution_time_sec": 2858.0734,
        "avg_prompt_tokens_per_run": 924094.0,
        "avg_output_tokens_per_run": 97754.0,
        "avg_tokens_per_run": 1021848.0,
        "avg_execution_time_per_run_sec": 952.6911
    },
    "stability_metrics": {
        "accuracy": {
            "mean": 0.3712121212121212,
            "std_dev": 0.057195715418717805,
            "variance": 0.0032713498622589537,
            "min": 0.3181818181818182,
            "max": 0.4318181818181818,
            "range": 0.11363636363636365,
            "coefficient_of_variation": 0.1540782537810357
        },
        "correct_cases": {
            "mean": 16.333333333333332,
            "std_dev": 2.516611478423583,
            "variance": 6.333333333333333,
            "min": 14,
            "max": 19,
            "range": 5
        },
        "incorrect_cases": {
            "mean": 27.666666666666668,
            "std_dev": 2.516611478423583,
            "variance": 6.333333333333333,
            "min": 25,
            "max": 30,
            "range": 5
        },
        "avg_distance_for_correct_cases": {
            "mean": 7.709743107769424,
            "std_dev": 2.341030918610566,
            "variance": 5.480425761890628,
            "min": 5.2631578947368425,
            "max": 9.928571428571429,
            "range": 4.665413533834586
        },
        "avg_distance_for_incorrect_cases": {
            "mean": 36.35952380952381,
            "std_dev": 3.1733129322502047,
            "variance": 10.069914965986392,
            "min": 34.17857142857143,
            "max": 40.0,
            "range": 5.821428571428569
        },
        "overall_avg_distance": {
            "mean": 25.53030303030303,
            "std_dev": 1.2467588834648906,
            "variance": 1.5544077134986205,
            "min": 24.636363636363637,
            "max": 26.954545454545453,
            "range": 2.3181818181818166
        },
        "normalized_avg_distance_for_correct_cases": {
            "mean": 0.1352798521550073,
            "std_dev": 0.020931879020367018,
            "variance": 0.00043814355932328083,
            "min": 0.12054486124381786,
            "max": 0.15923962266433273,
            "range": 0.03869476142051487
        },
        "normalized_avg_distance_for_incorrect_cases": {
            "mean": 0.4660872353035069,
            "std_dev": 0.019765306607718292,
            "variance": 0.0003906673452971124,
            "min": 0.44800097485540846,
            "max": 0.48718576815665876,
            "range": 0.0391847933012503
        },
        "normalized_overall_avg_distance": {
            "mean": 0.34218969138336214,
            "std_dev": 0.02099879315265435,
            "variance": 0.00044094931386796325,
            "min": 0.3289260244511937,
            "max": 0.36640008191486956,
            "range": 0.03747405746367588
        },
        "correct_step_number_predictions": {
            "mean": 14,
            "std_dev": 1.0,
            "variance": 1,
            "min": 13,
            "max": 15,
            "range": 2
        },
        "incorrect_step_number_predictions": {
            "mean": 30,
            "std_dev": 1.0,
            "variance": 1,
            "min": 29,
            "max": 31,
            "range": 2
        },
        "step_number_accuracy": {
            "mean": 0.3181818181818182,
            "std_dev": 0.022727272727272707,
            "variance": 0.0005165289256198337,
            "min": 0.29545454545454547,
            "max": 0.3409090909090909,
            "range": 0.045454545454545414
        },
        "step_accuracy_within_+-1": {
            "mean": 0.4090909090909091,
            "std_dev": 0.04545454545454544,
            "variance": 0.0020661157024793376,
            "min": 0.36363636363636365,
            "max": 0.45454545454545453,
            "range": 0.09090909090909088
        },
        "step_accuracy_within_+-2": {
            "mean": 0.4393939393939394,
            "std_dev": 0.057195715418717805,
            "variance": 0.0032713498622589537,
            "min": 0.38636363636363635,
            "max": 0.5,
            "range": 0.11363636363636365
        },
        "step_accuracy_within_+-3": {
            "mean": 0.4772727272727273,
            "std_dev": 0.04545454545454544,
            "variance": 0.0020661157024793376,
            "min": 0.4318181818181818,
            "max": 0.5227272727272727,
            "range": 0.09090909090909088
        },
        "step_accuracy_within_+-4": {
            "mean": 0.49242424242424243,
            "std_dev": 0.034716482537544245,
            "variance": 0.0012052341597796144,
            "min": 0.45454545454545453,
            "max": 0.5227272727272727,
            "range": 0.06818181818181818
        },
        "step_accuracy_within_+-5": {
            "mean": 0.5075757575757576,
            "std_dev": 0.013121597027036937,
            "variance": 0.0001721763085399446,
            "min": 0.5,
            "max": 0.5227272727272727,
            "range": 0.022727272727272707
        },
        "total_prompt_tokens": {
            "mean": 924094,
            "std_dev": 0.0,
            "variance": 0,
            "min": 924094,
            "max": 924094,
            "range": 0
        },
        "total_output_tokens": {
            "mean": 97754,
            "std_dev": 2480.9748084170465,
            "variance": 6155236,
            "min": 95674,
            "max": 100500,
            "range": 4826
        },
        "total_tokens": {
            "mean": 1021848,
            "std_dev": 2480.9748084170465,
            "variance": 6155236,
            "min": 1019768,
            "max": 1024594,
            "range": 4826
        },
        "total_execution_time_sec": {
            "mean": 952.6911333333334,
            "std_dev": 54.23493419801789,
            "variance": 2941.42808746333,
            "min": 892.0094,
            "max": 996.4381,
            "range": 104.42869999999994
        }
    }
}