{
    "math_word_problem_generation": {
        "initial_model=gpt-4-0613": {
            "baseline_model=always_error": {
                "total_num": 140,
                "prediction_error_num": 140,
                "gold_error_num": 77,
                "metrics": {
                    "accuracy": 0.55,
                    "precision": 0.55,
                    "recall": 1.0,
                    "f1": 0.7096774193548387,
                    "true_negative_rate": 0.0,
                    "false_positive_rate": 0.45,
                    "false_negative_rate": 0.0,
                    "true_positive_rate": 0.55
                }
            },
            "baseline_model=random": {
                "metrics": {
                    "accuracy": 0.505,
                    "precision": 0.55,
                    "recall": 0.55,
                    "f1": 0.55,
                    "true_negative_rate": 0.20249999999999996,
                    "false_positive_rate": 0.2475,
                    "false_negative_rate": 0.2475,
                    "true_positive_rate": 0.30250000000000005
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=always_error": {
                "total_num": 160,
                "prediction_error_num": 160,
                "gold_error_num": 120,
                "metrics": {
                    "accuracy": 0.75,
                    "precision": 0.75,
                    "recall": 1.0,
                    "f1": 0.8571428571428571,
                    "true_negative_rate": 0.0,
                    "false_positive_rate": 0.25,
                    "false_negative_rate": 0.0,
                    "true_positive_rate": 0.75
                }
            },
            "baseline_model=random": {
                "metrics": {
                    "accuracy": 0.625,
                    "precision": 0.75,
                    "recall": 0.75,
                    "f1": 0.75,
                    "true_negative_rate": 0.0625,
                    "false_positive_rate": 0.1875,
                    "false_negative_rate": 0.1875,
                    "true_positive_rate": 0.5625
                }
            }
        }
    },
    "finegrained_fact_verification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=always_error": {
                "total_num": 140,
                "prediction_error_num": 140,
                "gold_error_num": 78,
                "metrics": {
                    "accuracy": 0.5571428571428572,
                    "precision": 0.5571428571428572,
                    "recall": 1.0,
                    "f1": 0.7155963302752294,
                    "true_negative_rate": 0.0,
                    "false_positive_rate": 0.44285714285714284,
                    "false_negative_rate": 0.0,
                    "true_positive_rate": 0.5571428571428572
                }
            },
            "baseline_model=random": {
                "metrics": {
                    "accuracy": 0.5065306122448979,
                    "precision": 0.5571428571428572,
                    "recall": 0.5571428571428572,
                    "f1": 0.5571428571428572,
                    "true_negative_rate": 0.19612244897959183,
                    "false_positive_rate": 0.246734693877551,
                    "false_negative_rate": 0.246734693877551,
                    "true_positive_rate": 0.3104081632653061
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=always_error": {
                "total_num": 160,
                "prediction_error_num": 160,
                "gold_error_num": 126,
                "metrics": {
                    "accuracy": 0.7875,
                    "precision": 0.7875,
                    "recall": 1.0,
                    "f1": 0.8811188811188811,
                    "true_negative_rate": 0.0,
                    "false_positive_rate": 0.2125,
                    "false_negative_rate": 0.0,
                    "true_positive_rate": 0.7875
                }
            },
            "baseline_model=random": {
                "metrics": {
                    "accuracy": 0.6653125,
                    "precision": 0.7875,
                    "recall": 0.7875,
                    "f1": 0.7875,
                    "true_negative_rate": 0.04515625000000001,
                    "false_positive_rate": 0.16734375,
                    "false_negative_rate": 0.16734375,
                    "true_positive_rate": 0.62015625
                }
            }
        }
    },
    "answerability_classification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=always_error": {
                "total_num": 140,
                "prediction_error_num": 140,
                "gold_error_num": 81,
                "metrics": {
                    "accuracy": 0.5785714285714286,
                    "precision": 0.5785714285714286,
                    "recall": 1.0,
                    "f1": 0.7330316742081447,
                    "true_negative_rate": 0.0,
                    "false_positive_rate": 0.42142857142857143,
                    "false_negative_rate": 0.0,
                    "true_positive_rate": 0.5785714285714286
                }
            },
            "baseline_model=random": {
                "metrics": {
                    "accuracy": 0.5123469387755102,
                    "precision": 0.5785714285714286,
                    "recall": 0.5785714285714286,
                    "f1": 0.5785714285714286,
                    "true_negative_rate": 0.1776020408163265,
                    "false_positive_rate": 0.24382653061224488,
                    "false_negative_rate": 0.24382653061224488,
                    "true_positive_rate": 0.3347448979591837
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=always_error": {
                "total_num": 160,
                "prediction_error_num": 160,
                "gold_error_num": 124,
                "metrics": {
                    "accuracy": 0.775,
                    "precision": 0.775,
                    "recall": 1.0,
                    "f1": 0.8732394366197183,
                    "true_negative_rate": 0.0,
                    "false_positive_rate": 0.225,
                    "false_negative_rate": 0.0,
                    "true_positive_rate": 0.775
                }
            },
            "baseline_model=random": {
                "metrics": {
                    "accuracy": 0.6512500000000001,
                    "precision": 0.775,
                    "recall": 0.775,
                    "f1": 0.775,
                    "true_negative_rate": 0.05062499999999999,
                    "false_positive_rate": 0.17437499999999997,
                    "false_negative_rate": 0.17437499999999997,
                    "true_positive_rate": 0.6006250000000001
                }
            }
        }
    }
}