{
    "math_word_problem_generation": {
        "gpt-4-0613": {
            "total_num": 34,
            "prediction_error_num": 18,
            "gold_error_num": 22,
            "metrics": {
                "accuracy": 0.8823529411764706,
                "precision": 1.0,
                "recall": 0.8181818181818182,
                "f1": 0.9,
                "true_negative_rate": 0.35294117647058826,
                "false_positive_rate": 0.0,
                "false_negative_rate": 0.11764705882352941,
                "true_positive_rate": 0.5294117647058824
            }
        },
        "Llama-2-70b-chat-hf": {
            "total_num": 34,
            "prediction_error_num": 29,
            "gold_error_num": 30,
            "metrics": {
                "accuracy": 0.9705882352941176,
                "precision": 1.0,
                "recall": 0.9666666666666667,
                "f1": 0.9830508474576272,
                "true_negative_rate": 0.11764705882352941,
                "false_positive_rate": 0.0,
                "false_negative_rate": 0.029411764705882353,
                "true_positive_rate": 0.8529411764705882
            }
        }
    },
    "finegrained_fact_verification": {
        "gpt-4-0613": {
            "total_num": 35,
            "prediction_error_num": 22,
            "gold_error_num": 22,
            "metrics": {
                "accuracy": 0.9428571428571428,
                "precision": 0.9545454545454546,
                "recall": 0.9545454545454546,
                "f1": 0.9545454545454546,
                "true_negative_rate": 0.34285714285714286,
                "false_positive_rate": 0.02857142857142857,
                "false_negative_rate": 0.02857142857142857,
                "true_positive_rate": 0.6
            }
        },
        "Llama-2-70b-chat-hf": {
            "total_num": 35,
            "prediction_error_num": 30,
            "gold_error_num": 30,
            "metrics": {
                "accuracy": 1.0,
                "precision": 1.0,
                "recall": 1.0,
                "f1": 1.0,
                "true_negative_rate": 0.14285714285714285,
                "false_positive_rate": 0.0,
                "false_negative_rate": 0.0,
                "true_positive_rate": 0.8571428571428571
            }
        }
    },
    "answerability_classification": {
        "gpt-4-0613": {
            "total_num": 35,
            "prediction_error_num": 19,
            "gold_error_num": 23,
            "metrics": {
                "accuracy": 0.8285714285714286,
                "precision": 0.9473684210526315,
                "recall": 0.782608695652174,
                "f1": 0.8571428571428571,
                "true_negative_rate": 0.3142857142857143,
                "false_positive_rate": 0.02857142857142857,
                "false_negative_rate": 0.14285714285714285,
                "true_positive_rate": 0.5142857142857142
            }
        },
        "Llama-2-70b-chat-hf": {
            "total_num": 35,
            "prediction_error_num": 30,
            "gold_error_num": 29,
            "metrics": {
                "accuracy": 0.9714285714285714,
                "precision": 0.9666666666666667,
                "recall": 1.0,
                "f1": 0.9830508474576272,
                "true_negative_rate": 0.14285714285714285,
                "false_positive_rate": 0.02857142857142857,
                "false_negative_rate": 0.0,
                "true_positive_rate": 0.8285714285714286
            }
        }
    },
    "average": {
        "gpt-4-0613": {
            "metrics": {
                "accuracy": {
                    "average": 0.884593837535014,
                    "stdev": 0.04668384684817847
                },
                "precision": {
                    "average": 0.967304625199362,
                    "stdev": 0.023304049754708872
                },
                "recall": {
                    "average": 0.8517786561264823,
                    "stdev": 0.0741040840286667
                },
                "f1": {
                    "average": 0.9038961038961039,
                    "stdev": 0.03985976429408525
                },
                "true_negative_rate": {
                    "average": 0.33669467787114843,
                    "stdev": 0.016371584533560187
                },
                "false_positive_rate": {
                    "average": 0.019047619047619046,
                    "stdev": 0.013468700594029477
                },
                "false_negative_rate": {
                    "average": 0.09635854341736694,
                    "stdev": 0.049025209763942146
                },
                "true_positive_rate": {
                    "average": 0.5478991596638655,
                    "stdev": 0.037354808018307345
                }
            }
        },
        "Llama-2-70b-chat-hf": {
            "metrics": {
                "accuracy": {
                    "average": 0.9806722689075631,
                    "stdev": 0.013671074892899085
                },
                "precision": {
                    "average": 0.9888888888888889,
                    "stdev": 0.01571348402636772
                },
                "recall": {
                    "average": 0.9888888888888889,
                    "stdev": 0.01571348402636772
                },
                "f1": {
                    "average": 0.9887005649717514,
                    "stdev": 0.007989907132051364
                },
                "true_negative_rate": {
                    "average": 0.13445378151260504,
                    "stdev": 0.011884147582967181
                },
                "false_positive_rate": {
                    "average": 0.009523809523809523,
                    "stdev": 0.013468700594029479
                },
                "false_negative_rate": {
                    "average": 0.00980392156862745,
                    "stdev": 0.01386483884679505
                },
                "true_positive_rate": {
                    "average": 0.846218487394958,
                    "stdev": 0.012595701487908704
                }
            }
        }
    }
}