{
    "math_word_problem_generation": {
        "initial_model=gpt-4-0613": {
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 24,
                    "gold_error_num": 29,
                    "metrics": {
                        "accuracy": 0.62,
                        "precision": 0.75,
                        "recall": 0.6206896551724138,
                        "f1": 0.6792452830188679,
                        "true_negative_rate": 0.3,
                        "false_positive_rate": 0.12,
                        "false_negative_rate": 0.22,
                        "true_positive_rate": 0.36
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 29,
                    "metrics": {
                        "accuracy": {
                            "average": 0.62,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.75,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6206896551724138,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6792452830188679,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.3,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.12,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.22,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.36,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 39,
                    "gold_error_num": 29,
                    "metrics": {
                        "accuracy": 0.76,
                        "precision": 0.717948717948718,
                        "recall": 0.9655172413793104,
                        "f1": 0.8235294117647058,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.22,
                        "false_negative_rate": 0.02,
                        "true_positive_rate": 0.56
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 29,
                    "metrics": {
                        "accuracy": {
                            "average": 0.76,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.717948717948718,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.9655172413793104,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8235294117647058,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.22,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.02,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.56,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 24,
                    "gold_error_num": 29,
                    "metrics": {
                        "accuracy": 0.82,
                        "precision": 0.9166666666666666,
                        "recall": 0.7586206896551724,
                        "f1": 0.8301886792452831,
                        "true_negative_rate": 0.38,
                        "false_positive_rate": 0.04,
                        "false_negative_rate": 0.14,
                        "true_positive_rate": 0.44
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 29,
                    "metrics": {
                        "accuracy": {
                            "average": 0.82,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9166666666666666,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7586206896551724,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8301886792452831,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.38,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.04,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.14,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.44,
                            "stdev": 0.0
                        }
                    }
                }
            }
        }
    },
    "finegrained_fact_verification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 38,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": 0.56,
                        "precision": 0.6052631578947368,
                        "recall": 0.8214285714285714,
                        "f1": 0.696969696969697,
                        "true_negative_rate": 0.14,
                        "false_positive_rate": 0.3,
                        "false_negative_rate": 0.1,
                        "true_positive_rate": 0.46
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": {
                            "average": 0.56,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6052631578947368,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8214285714285714,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.696969696969697,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.14,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.3,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.1,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.46,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 23,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": 0.58,
                        "precision": 0.6521739130434783,
                        "recall": 0.5357142857142857,
                        "f1": 0.5882352941176471,
                        "true_negative_rate": 0.28,
                        "false_positive_rate": 0.16,
                        "false_negative_rate": 0.26,
                        "true_positive_rate": 0.3
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": {
                            "average": 0.58,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6521739130434783,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5357142857142857,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5882352941176471,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.28,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.16,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.26,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.3,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 4,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": 0.48,
                        "precision": 0.75,
                        "recall": 0.10714285714285714,
                        "f1": 0.1875,
                        "true_negative_rate": 0.42,
                        "false_positive_rate": 0.02,
                        "false_negative_rate": 0.5,
                        "true_positive_rate": 0.06
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": {
                            "average": 0.48,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.75,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.10714285714285714,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.1875,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.42,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.02,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.06,
                            "stdev": 0.0
                        }
                    }
                }
            }
        }
    },
    "answerability_classification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 23,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": 0.36,
                        "precision": 0.43478260869565216,
                        "recall": 0.35714285714285715,
                        "f1": 0.39215686274509803,
                        "true_negative_rate": 0.18,
                        "false_positive_rate": 0.26,
                        "false_negative_rate": 0.36,
                        "true_positive_rate": 0.2
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": {
                            "average": 0.36,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.43478260869565216,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.35714285714285715,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.39215686274509803,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.18,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.26,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.36,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 19,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": 0.54,
                        "precision": 0.631578947368421,
                        "recall": 0.42857142857142855,
                        "f1": 0.5106382978723404,
                        "true_negative_rate": 0.3,
                        "false_positive_rate": 0.14,
                        "false_negative_rate": 0.32,
                        "true_positive_rate": 0.24
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": {
                            "average": 0.54,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.631578947368421,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.42857142857142855,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5106382978723404,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.3,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.14,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.32,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.24,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 50,
                    "prediction_error_num": 3,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": 0.46,
                        "precision": 0.6666666666666666,
                        "recall": 0.07142857142857142,
                        "f1": 0.12903225806451613,
                        "true_negative_rate": 0.42,
                        "false_positive_rate": 0.02,
                        "false_negative_rate": 0.52,
                        "true_positive_rate": 0.04
                    }
                },
                "average": {
                    "total_num": 50,
                    "gold_error_num": 28,
                    "metrics": {
                        "accuracy": {
                            "average": 0.46,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6666666666666666,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.07142857142857142,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.12903225806451613,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.42,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.02,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.52,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.04,
                            "stdev": 0.0
                        }
                    }
                }
            }
        }
    },
    "average": {
        "initial_model=gpt-4-0613": {
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "average": {
                    "total_num": 150,
                    "gold_error_num": 85,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5133333333333333,
                            "stdev": 0.11115554667022046
                        },
                        "precision": {
                            "average": 0.5966819221967963,
                            "stdev": 0.1288299373845414
                        },
                        "recall": {
                            "average": 0.5997536945812808,
                            "stdev": 0.1901210876553047
                        },
                        "f1": {
                            "average": 0.589457280911221,
                            "stdev": 0.13969998799539515
                        },
                        "true_negative_rate": {
                            "average": 0.20666666666666667,
                            "stdev": 0.06798692684790379
                        },
                        "false_positive_rate": {
                            "average": 0.22666666666666666,
                            "stdev": 0.0771722460186015
                        },
                        "false_negative_rate": {
                            "average": 0.22666666666666666,
                            "stdev": 0.10624918300339485
                        },
                        "true_positive_rate": {
                            "average": 0.34,
                            "stdev": 0.10708252269472673
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "average": {
                    "total_num": 150,
                    "gold_error_num": 85,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6266666666666666,
                            "stdev": 0.09568466729604883
                        },
                        "precision": {
                            "average": 0.667233859453539,
                            "stdev": 0.036833280303992376
                        },
                        "recall": {
                            "average": 0.6432676518883416,
                            "stdev": 0.23202513715042045
                        },
                        "f1": {
                            "average": 0.6408010012515644,
                            "stdev": 0.1330352771838304
                        },
                        "true_negative_rate": {
                            "average": 0.26,
                            "stdev": 0.04320493798938573
                        },
                        "false_positive_rate": {
                            "average": 0.17333333333333334,
                            "stdev": 0.033993463423951896
                        },
                        "false_negative_rate": {
                            "average": 0.20000000000000004,
                            "stdev": 0.12961481396815722
                        },
                        "true_positive_rate": {
                            "average": 0.3666666666666667,
                            "stdev": 0.13888444437333108
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "average": {
                    "total_num": 150,
                    "gold_error_num": 85,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5866666666666666,
                            "stdev": 0.16519348924485153
                        },
                        "precision": {
                            "average": 0.7777777777777777,
                            "stdev": 0.10393492741038725
                        },
                        "recall": {
                            "average": 0.3123973727422003,
                            "stdev": 0.31586422607256576
                        },
                        "f1": {
                            "average": 0.3822403124365998,
                            "stdev": 0.31764542465531553
                        },
                        "true_negative_rate": {
                            "average": 0.4066666666666667,
                            "stdev": 0.018856180831641256
                        },
                        "false_positive_rate": {
                            "average": 0.02666666666666667,
                            "stdev": 0.009428090415820633
                        },
                        "false_negative_rate": {
                            "average": 0.3866666666666667,
                            "stdev": 0.17461067804945057
                        },
                        "true_positive_rate": {
                            "average": 0.18000000000000002,
                            "stdev": 0.1840289832245635
                        }
                    }
                }
            }
        }
    }
}