{
    "math_word_problem_generation": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 56,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.18571428571428572,
                        "precision": 0.4642857142857143,
                        "recall": 0.33766233766233766,
                        "f1": 0.39097744360902253,
                        "true_negative_rate": 0.2357142857142857,
                        "false_positive_rate": 0.21428571428571427,
                        "false_negative_rate": 0.36428571428571427,
                        "true_positive_rate": 0.18571428571428572
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 118,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.5423728813559322,
                        "recall": 0.8311688311688312,
                        "f1": 0.6564102564102564,
                        "true_negative_rate": 0.06428571428571428,
                        "false_positive_rate": 0.38571428571428573,
                        "false_negative_rate": 0.09285714285714286,
                        "true_positive_rate": 0.45714285714285713
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 114,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5214285714285715,
                        "precision": 0.543859649122807,
                        "recall": 0.8051948051948052,
                        "f1": 0.6492146596858639,
                        "true_negative_rate": 0.07857142857142857,
                        "false_positive_rate": 0.37142857142857144,
                        "false_negative_rate": 0.10714285714285714,
                        "true_positive_rate": 0.44285714285714284
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 4,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4642857142857143,
                        "precision": 0.75,
                        "recall": 0.03896103896103896,
                        "f1": 0.07407407407407407,
                        "true_negative_rate": 0.44285714285714284,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5285714285714286,
                        "true_positive_rate": 0.02142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4178571428571429,
                            "stdev": 0.13557323497215934
                        },
                        "precision": {
                            "average": 0.5751295611911134,
                            "stdev": 0.10596796242330071
                        },
                        "recall": {
                            "average": 0.5032467532467533,
                            "stdev": 0.33229690731264155
                        },
                        "f1": {
                            "average": 0.4426691084448042,
                            "stdev": 0.238159962340082
                        },
                        "true_negative_rate": {
                            "average": 0.20535714285714285,
                            "stdev": 0.1527281665480957
                        },
                        "false_positive_rate": {
                            "average": 0.24464285714285713,
                            "stdev": 0.1527281665480957
                        },
                        "false_negative_rate": {
                            "average": 0.27321428571428574,
                            "stdev": 0.18276329902195285
                        },
                        "true_positive_rate": {
                            "average": 0.27678571428571425,
                            "stdev": 0.18276329902195285
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 91,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.37142857142857144,
                        "precision": 0.5164835164835165,
                        "recall": 0.6103896103896104,
                        "f1": 0.5595238095238095,
                        "true_negative_rate": 0.1357142857142857,
                        "false_positive_rate": 0.3142857142857143,
                        "false_negative_rate": 0.21428571428571427,
                        "true_positive_rate": 0.3357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 58,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.603448275862069,
                        "recall": 0.45454545454545453,
                        "f1": 0.5185185185185185,
                        "true_negative_rate": 0.2857142857142857,
                        "false_positive_rate": 0.16428571428571428,
                        "false_negative_rate": 0.3,
                        "true_positive_rate": 0.25
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 114,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5785714285714286,
                        "precision": 0.5789473684210527,
                        "recall": 0.8571428571428571,
                        "f1": 0.6910994764397905,
                        "true_negative_rate": 0.10714285714285714,
                        "false_positive_rate": 0.34285714285714286,
                        "false_negative_rate": 0.07857142857142857,
                        "true_positive_rate": 0.4714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 32,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.44285714285714284,
                        "precision": 0.5,
                        "recall": 0.2077922077922078,
                        "f1": 0.29357798165137616,
                        "true_negative_rate": 0.3357142857142857,
                        "false_positive_rate": 0.11428571428571428,
                        "false_negative_rate": 0.4357142857142857,
                        "true_positive_rate": 0.11428571428571428
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4714285714285715,
                            "stdev": 0.07542397172122457
                        },
                        "precision": {
                            "average": 0.5497197901916595,
                            "stdev": 0.0427718041713695
                        },
                        "recall": {
                            "average": 0.5324675324675324,
                            "stdev": 0.23609946704802393
                        },
                        "f1": {
                            "average": 0.5156799465333737,
                            "stdev": 0.14320582747384208
                        },
                        "true_negative_rate": {
                            "average": 0.21607142857142853,
                            "stdev": 0.09680811548878428
                        },
                        "false_positive_rate": {
                            "average": 0.23392857142857143,
                            "stdev": 0.09680811548878428
                        },
                        "false_negative_rate": {
                            "average": 0.2571428571428571,
                            "stdev": 0.12985470687641318
                        },
                        "true_positive_rate": {
                            "average": 0.2928571428571428,
                            "stdev": 0.12985470687641318
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.55,
                        "recall": 1.0,
                        "f1": 0.7096774193548387,
                        "true_negative_rate": 0.0,
                        "false_positive_rate": 0.45,
                        "false_negative_rate": 0.0,
                        "true_positive_rate": 0.55
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 127,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6214285714285714,
                        "precision": 0.5984251968503937,
                        "recall": 0.987012987012987,
                        "f1": 0.7450980392156863,
                        "true_negative_rate": 0.08571428571428572,
                        "false_positive_rate": 0.36428571428571427,
                        "false_negative_rate": 0.007142857142857143,
                        "true_positive_rate": 0.5428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.55,
                        "recall": 1.0,
                        "f1": 0.7096774193548387,
                        "true_negative_rate": 0.0,
                        "false_positive_rate": 0.45,
                        "false_negative_rate": 0.0,
                        "true_positive_rate": 0.55
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 2,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4642857142857143,
                        "precision": 1.0,
                        "recall": 0.025974025974025976,
                        "f1": 0.05063291139240506,
                        "true_negative_rate": 0.45,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.5357142857142857,
                        "true_positive_rate": 0.014285714285714285
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5464285714285715,
                            "stdev": 0.055673061671856774
                        },
                        "precision": {
                            "average": 0.6746062992125985,
                            "stdev": 0.1889034677263502
                        },
                        "recall": {
                            "average": 0.7532467532467532,
                            "stdev": 0.41992457699285046
                        },
                        "f1": {
                            "average": 0.5537714473294422,
                            "stdev": 0.2908468649195131
                        },
                        "true_negative_rate": {
                            "average": 0.13392857142857142,
                            "stdev": 0.18580869852883528
                        },
                        "false_positive_rate": {
                            "average": 0.31607142857142856,
                            "stdev": 0.18580869852883528
                        },
                        "false_negative_rate": {
                            "average": 0.1357142857142857,
                            "stdev": 0.23095851734606776
                        },
                        "true_positive_rate": {
                            "average": 0.4142857142857143,
                            "stdev": 0.23095851734606776
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 66,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.6363636363636364,
                        "recall": 0.5454545454545454,
                        "f1": 0.5874125874125874,
                        "true_negative_rate": 0.2785714285714286,
                        "false_positive_rate": 0.17142857142857143,
                        "false_negative_rate": 0.25,
                        "true_positive_rate": 0.3
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 51,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6071428571428571,
                        "precision": 0.7254901960784313,
                        "recall": 0.4805194805194805,
                        "f1": 0.578125,
                        "true_negative_rate": 0.35,
                        "false_positive_rate": 0.1,
                        "false_negative_rate": 0.2857142857142857,
                        "true_positive_rate": 0.2642857142857143
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 34,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.7352941176470589,
                        "recall": 0.3246753246753247,
                        "f1": 0.45045045045045046,
                        "true_negative_rate": 0.38571428571428573,
                        "false_positive_rate": 0.06428571428571428,
                        "false_negative_rate": 0.37142857142857144,
                        "true_positive_rate": 0.17857142857142858
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 15,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5142857142857142,
                        "precision": 0.8,
                        "recall": 0.15584415584415584,
                        "f1": 0.2608695652173913,
                        "true_negative_rate": 0.42857142857142855,
                        "false_positive_rate": 0.02142857142857143,
                        "false_negative_rate": 0.4642857142857143,
                        "true_positive_rate": 0.08571428571428572
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5535714285714285,
                            "stdev": 0.03444160986068912
                        },
                        "precision": {
                            "average": 0.7242869875222817,
                            "stdev": 0.05827859933507103
                        },
                        "recall": {
                            "average": 0.37662337662337664,
                            "stdev": 0.15061575980185976
                        },
                        "f1": {
                            "average": 0.46921440077010734,
                            "stdev": 0.13190146538540776
                        },
                        "true_negative_rate": {
                            "average": 0.3607142857142857,
                            "stdev": 0.05498144399407375
                        },
                        "false_positive_rate": {
                            "average": 0.08928571428571429,
                            "stdev": 0.054981443994073766
                        },
                        "false_negative_rate": {
                            "average": 0.34285714285714286,
                            "stdev": 0.08283866789102289
                        },
                        "true_positive_rate": {
                            "average": 0.20714285714285716,
                            "stdev": 0.08283866789102287
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 127,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5642857142857143,
                        "precision": 0.5669291338582677,
                        "recall": 0.935064935064935,
                        "f1": 0.7058823529411765,
                        "true_negative_rate": 0.05714285714285714,
                        "false_positive_rate": 0.39285714285714285,
                        "false_negative_rate": 0.03571428571428571,
                        "true_positive_rate": 0.5142857142857142
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 72,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6642857142857143,
                        "precision": 0.75,
                        "recall": 0.7012987012987013,
                        "f1": 0.7248322147651006,
                        "true_negative_rate": 0.32142857142857145,
                        "false_positive_rate": 0.12857142857142856,
                        "false_negative_rate": 0.16428571428571428,
                        "true_positive_rate": 0.38571428571428573
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 24,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.625,
                        "recall": 0.19480519480519481,
                        "f1": 0.297029702970297,
                        "true_negative_rate": 0.38571428571428573,
                        "false_positive_rate": 0.06428571428571428,
                        "false_negative_rate": 0.44285714285714284,
                        "true_positive_rate": 0.10714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 14,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.7142857142857143,
                        "recall": 0.12987012987012986,
                        "f1": 0.21978021978021978,
                        "true_negative_rate": 0.42142857142857143,
                        "false_positive_rate": 0.02857142857142857,
                        "false_negative_rate": 0.4785714285714286,
                        "true_positive_rate": 0.07142857142857142
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5517857142857143,
                            "stdev": 0.07185142692403823
                        },
                        "precision": {
                            "average": 0.6640537120359955,
                            "stdev": 0.07222948597144963
                        },
                        "recall": {
                            "average": 0.49025974025974023,
                            "stdev": 0.33895544106438924
                        },
                        "f1": {
                            "average": 0.48688112261419847,
                            "stdev": 0.23020030135518313
                        },
                        "true_negative_rate": {
                            "average": 0.29642857142857143,
                            "stdev": 0.14272315144777203
                        },
                        "false_positive_rate": {
                            "average": 0.15357142857142855,
                            "stdev": 0.14272315144777203
                        },
                        "false_negative_rate": {
                            "average": 0.28035714285714286,
                            "stdev": 0.1864254925854141
                        },
                        "true_positive_rate": {
                            "average": 0.26964285714285713,
                            "stdev": 0.1864254925854141
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 55,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5857142857142857,
                        "precision": 0.6727272727272727,
                        "recall": 0.4805194805194805,
                        "f1": 0.5606060606060606,
                        "true_negative_rate": 0.32142857142857145,
                        "false_positive_rate": 0.12857142857142856,
                        "false_negative_rate": 0.2857142857142857,
                        "true_positive_rate": 0.2642857142857143
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 14,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.7142857142857143,
                        "recall": 0.12987012987012986,
                        "f1": 0.21978021978021978,
                        "true_negative_rate": 0.42142857142857143,
                        "false_positive_rate": 0.02857142857142857,
                        "false_negative_rate": 0.4785714285714286,
                        "true_positive_rate": 0.07142857142857142
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 30,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5214285714285715,
                        "precision": 0.6666666666666666,
                        "recall": 0.2597402597402597,
                        "f1": 0.37383177570093457,
                        "true_negative_rate": 0.37857142857142856,
                        "false_positive_rate": 0.07142857142857142,
                        "false_negative_rate": 0.40714285714285714,
                        "true_positive_rate": 0.14285714285714285
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 5,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 1.0,
                        "recall": 0.06493506493506493,
                        "f1": 0.12195121951219512,
                        "true_negative_rate": 0.45,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.5142857142857142,
                        "true_positive_rate": 0.03571428571428571
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5214285714285715,
                            "stdev": 0.03944771791852594
                        },
                        "precision": {
                            "average": 0.7634199134199134,
                            "stdev": 0.13781387151145152
                        },
                        "recall": {
                            "average": 0.23376623376623376,
                            "stdev": 0.15879245793035693
                        },
                        "f1": {
                            "average": 0.31904231889985246,
                            "stdev": 0.16587102795781763
                        },
                        "true_negative_rate": {
                            "average": 0.39285714285714285,
                            "stdev": 0.04844521416518048
                        },
                        "false_positive_rate": {
                            "average": 0.05714285714285714,
                            "stdev": 0.04844521416518049
                        },
                        "false_negative_rate": {
                            "average": 0.4214285714285714,
                            "stdev": 0.0873358518616963
                        },
                        "true_positive_rate": {
                            "average": 0.1285714285714286,
                            "stdev": 0.08733585186169632
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 119,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6714285714285714,
                        "precision": 0.6302521008403361,
                        "recall": 0.974025974025974,
                        "f1": 0.7653061224489796,
                        "true_negative_rate": 0.1357142857142857,
                        "false_positive_rate": 0.3142857142857143,
                        "false_negative_rate": 0.014285714285714285,
                        "true_positive_rate": 0.5357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 98,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6357142857142857,
                        "precision": 0.6326530612244898,
                        "recall": 0.8051948051948052,
                        "f1": 0.7085714285714285,
                        "true_negative_rate": 0.19285714285714287,
                        "false_positive_rate": 0.2571428571428571,
                        "false_negative_rate": 0.10714285714285714,
                        "true_positive_rate": 0.44285714285714284
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 59,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6571428571428571,
                        "precision": 0.7457627118644068,
                        "recall": 0.5714285714285714,
                        "f1": 0.6470588235294118,
                        "true_negative_rate": 0.34285714285714286,
                        "false_positive_rate": 0.10714285714285714,
                        "false_negative_rate": 0.2357142857142857,
                        "true_positive_rate": 0.3142857142857143
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 26,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.7692307692307693,
                        "recall": 0.2597402597402597,
                        "f1": 0.3883495145631068,
                        "true_negative_rate": 0.40714285714285714,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.40714285714285714,
                        "true_positive_rate": 0.14285714285714285
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6285714285714286,
                            "stdev": 0.04711037842240325
                        },
                        "precision": {
                            "average": 0.6944746607900005,
                            "stdev": 0.06357158821271
                        },
                        "recall": {
                            "average": 0.6525974025974026,
                            "stdev": 0.26810791095064584
                        },
                        "f1": {
                            "average": 0.6273214722782317,
                            "stdev": 0.14416871946525187
                        },
                        "true_negative_rate": {
                            "average": 0.2696428571428572,
                            "stdev": 0.10965797941317172
                        },
                        "false_positive_rate": {
                            "average": 0.18035714285714283,
                            "stdev": 0.1096579794131717
                        },
                        "false_negative_rate": {
                            "average": 0.19107142857142856,
                            "stdev": 0.1474593510228552
                        },
                        "true_positive_rate": {
                            "average": 0.3589285714285714,
                            "stdev": 0.1474593510228552
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 99,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.5757575757575758,
                        "recall": 0.7402597402597403,
                        "f1": 0.6477272727272727,
                        "true_negative_rate": 0.15,
                        "false_positive_rate": 0.3,
                        "false_negative_rate": 0.14285714285714285,
                        "true_positive_rate": 0.40714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 16,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.4642857142857143,
                        "precision": 0.5625,
                        "recall": 0.11688311688311688,
                        "f1": 0.1935483870967742,
                        "true_negative_rate": 0.4,
                        "false_positive_rate": 0.05,
                        "false_negative_rate": 0.4857142857142857,
                        "true_positive_rate": 0.06428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 93,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6,
                        "precision": 0.6129032258064516,
                        "recall": 0.7402597402597403,
                        "f1": 0.6705882352941176,
                        "true_negative_rate": 0.19285714285714287,
                        "false_positive_rate": 0.2571428571428571,
                        "false_negative_rate": 0.14285714285714285,
                        "true_positive_rate": 0.40714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 1,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.44285714285714284,
                        "precision": 0.0,
                        "recall": 0.0,
                        "f1": 0.0,
                        "true_negative_rate": 0.44285714285714284,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.55,
                        "true_positive_rate": 0.0
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5160714285714285,
                            "stdev": 0.06475522987310266
                        },
                        "precision": {
                            "average": 0.4377902003910068,
                            "stdev": 0.25343261531056094
                        },
                        "recall": {
                            "average": 0.39935064935064934,
                            "stdev": 0.34340459539878304
                        },
                        "f1": {
                            "average": 0.3779659737795411,
                            "stdev": 0.2895112562392373
                        },
                        "true_negative_rate": {
                            "average": 0.29642857142857143,
                            "stdev": 0.12682343503260501
                        },
                        "false_positive_rate": {
                            "average": 0.15357142857142855,
                            "stdev": 0.12682343503260501
                        },
                        "false_negative_rate": {
                            "average": 0.33035714285714285,
                            "stdev": 0.18887252746933067
                        },
                        "true_positive_rate": {
                            "average": 0.21964285714285714,
                            "stdev": 0.18887252746933067
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 38,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.65,
                        "precision": 0.868421052631579,
                        "recall": 0.42857142857142855,
                        "f1": 0.5739130434782609,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.03571428571428571,
                        "false_negative_rate": 0.3142857142857143,
                        "true_positive_rate": 0.2357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 24,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6071428571428571,
                        "precision": 0.9583333333333334,
                        "recall": 0.2987012987012987,
                        "f1": 0.45544554455445546,
                        "true_negative_rate": 0.44285714285714284,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.38571428571428573,
                        "true_positive_rate": 0.16428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 56,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6928571428571428,
                        "precision": 0.8035714285714286,
                        "recall": 0.5844155844155844,
                        "f1": 0.6766917293233082,
                        "true_negative_rate": 0.37142857142857144,
                        "false_positive_rate": 0.07857142857142857,
                        "false_negative_rate": 0.22857142857142856,
                        "true_positive_rate": 0.32142857142857145
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 17,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5714285714285714,
                        "precision": 1.0,
                        "recall": 0.22077922077922077,
                        "f1": 0.3617021276595745,
                        "true_negative_rate": 0.45,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.42857142857142855,
                        "true_positive_rate": 0.12142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6303571428571428,
                            "stdev": 0.0455619672220467
                        },
                        "precision": {
                            "average": 0.9075814536340852,
                            "stdev": 0.07659726386367735
                        },
                        "recall": {
                            "average": 0.38311688311688313,
                            "stdev": 0.13790104273763182
                        },
                        "f1": {
                            "average": 0.5169381112538998,
                            "stdev": 0.11900305432006798
                        },
                        "true_negative_rate": {
                            "average": 0.41964285714285715,
                            "stdev": 0.030877886546054605
                        },
                        "false_positive_rate": {
                            "average": 0.030357142857142857,
                            "stdev": 0.03087788654605461
                        },
                        "false_negative_rate": {
                            "average": 0.33928571428571425,
                            "stdev": 0.0758455735056975
                        },
                        "true_positive_rate": {
                            "average": 0.21071428571428574,
                            "stdev": 0.07584557350569751
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 55,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.7428571428571429,
                        "precision": 0.8727272727272727,
                        "recall": 0.6233766233766234,
                        "f1": 0.7272727272727273,
                        "true_negative_rate": 0.4,
                        "false_positive_rate": 0.05,
                        "false_negative_rate": 0.20714285714285716,
                        "true_positive_rate": 0.34285714285714286
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 48,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.7214285714285714,
                        "precision": 0.8958333333333334,
                        "recall": 0.5584415584415584,
                        "f1": 0.688,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.03571428571428571,
                        "false_negative_rate": 0.24285714285714285,
                        "true_positive_rate": 0.30714285714285716
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 42,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6928571428571428,
                        "precision": 0.9047619047619048,
                        "recall": 0.4935064935064935,
                        "f1": 0.6386554621848739,
                        "true_negative_rate": 0.42142857142857143,
                        "false_positive_rate": 0.02857142857142857,
                        "false_negative_rate": 0.2785714285714286,
                        "true_positive_rate": 0.2714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 32,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.65,
                        "precision": 0.9375,
                        "recall": 0.38961038961038963,
                        "f1": 0.5504587155963303,
                        "true_negative_rate": 0.4357142857142857,
                        "false_positive_rate": 0.014285714285714285,
                        "false_negative_rate": 0.3357142857142857,
                        "true_positive_rate": 0.21428571428571427
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7017857142857143,
                            "stdev": 0.034764147024878185
                        },
                        "precision": {
                            "average": 0.9027056277056277,
                            "stdev": 0.023242239650253616
                        },
                        "recall": {
                            "average": 0.5162337662337662,
                            "stdev": 0.08632945324875538
                        },
                        "f1": {
                            "average": 0.6510967262634828,
                            "stdev": 0.06604435817151287
                        },
                        "true_negative_rate": {
                            "average": 0.41785714285714287,
                            "stdev": 0.012876968840942813
                        },
                        "false_positive_rate": {
                            "average": 0.03214285714285714,
                            "stdev": 0.01287696884094282
                        },
                        "false_negative_rate": {
                            "average": 0.26607142857142857,
                            "stdev": 0.047481199286815455
                        },
                        "true_positive_rate": {
                            "average": 0.2839285714285714,
                            "stdev": 0.047481199286815476
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 67,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6857142857142857,
                        "precision": 0.746268656716418,
                        "recall": 0.6493506493506493,
                        "f1": 0.6944444444444444,
                        "true_negative_rate": 0.32857142857142857,
                        "false_positive_rate": 0.12142857142857143,
                        "false_negative_rate": 0.19285714285714287,
                        "true_positive_rate": 0.35714285714285715
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 62,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.7214285714285714,
                        "precision": 0.8064516129032258,
                        "recall": 0.6493506493506493,
                        "f1": 0.7194244604316546,
                        "true_negative_rate": 0.36428571428571427,
                        "false_positive_rate": 0.08571428571428572,
                        "false_negative_rate": 0.19285714285714287,
                        "true_positive_rate": 0.35714285714285715
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 61,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.7285714285714285,
                        "precision": 0.819672131147541,
                        "recall": 0.6493506493506493,
                        "f1": 0.7246376811594203,
                        "true_negative_rate": 0.37142857142857144,
                        "false_positive_rate": 0.07857142857142857,
                        "false_negative_rate": 0.19285714285714287,
                        "true_positive_rate": 0.35714285714285715
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 44,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6785714285714286,
                        "precision": 0.8636363636363636,
                        "recall": 0.4935064935064935,
                        "f1": 0.628099173553719,
                        "true_negative_rate": 0.40714285714285714,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.2785714285714286,
                        "true_positive_rate": 0.2714285714285714
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7035714285714285,
                            "stdev": 0.02172415189392219
                        },
                        "precision": {
                            "average": 0.8090071911008871,
                            "stdev": 0.041954565051806836
                        },
                        "recall": {
                            "average": 0.6103896103896104,
                            "stdev": 0.06748249899619002
                        },
                        "f1": {
                            "average": 0.6916514398973096,
                            "stdev": 0.038425656188314845
                        },
                        "true_negative_rate": {
                            "average": 0.3678571428571429,
                            "stdev": 0.027893748842523765
                        },
                        "false_positive_rate": {
                            "average": 0.08214285714285714,
                            "stdev": 0.027893748842523765
                        },
                        "false_negative_rate": {
                            "average": 0.2142857142857143,
                            "stdev": 0.03711537444790451
                        },
                        "true_positive_rate": {
                            "average": 0.3357142857142857,
                            "stdev": 0.037115374447904526
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 65,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.25,
                        "precision": 0.5692307692307692,
                        "recall": 0.30833333333333335,
                        "f1": 0.4,
                        "true_negative_rate": 0.075,
                        "false_positive_rate": 0.175,
                        "false_negative_rate": 0.51875,
                        "true_positive_rate": 0.23125
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 125,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.6625,
                        "precision": 0.768,
                        "recall": 0.8,
                        "f1": 0.7836734693877551,
                        "true_negative_rate": 0.06875,
                        "false_positive_rate": 0.18125,
                        "false_negative_rate": 0.15,
                        "true_positive_rate": 0.6
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 133,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.75625,
                        "precision": 0.8045112781954887,
                        "recall": 0.8916666666666667,
                        "f1": 0.8458498023715415,
                        "true_negative_rate": 0.0875,
                        "false_positive_rate": 0.1625,
                        "false_negative_rate": 0.08125,
                        "true_positive_rate": 0.66875
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 4,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.2625,
                        "precision": 0.75,
                        "recall": 0.025,
                        "f1": 0.04838709677419355,
                        "true_negative_rate": 0.24375,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.73125,
                        "true_positive_rate": 0.01875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4828125,
                            "stdev": 0.2290168781307395
                        },
                        "precision": {
                            "average": 0.7229355118565645,
                            "stdev": 0.09088873440853484
                        },
                        "recall": {
                            "average": 0.50625,
                            "stdev": 0.3555304624391871
                        },
                        "f1": {
                            "average": 0.5194775921333725,
                            "stdev": 0.32113837788938643
                        },
                        "true_negative_rate": {
                            "average": 0.11875,
                            "stdev": 0.07248383440464501
                        },
                        "false_positive_rate": {
                            "average": 0.13124999999999998,
                            "stdev": 0.07248383440464501
                        },
                        "false_negative_rate": {
                            "average": 0.37031250000000004,
                            "stdev": 0.2666478468293903
                        },
                        "true_positive_rate": {
                            "average": 0.3796875,
                            "stdev": 0.2666478468293903
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 125,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.58125,
                        "precision": 0.728,
                        "recall": 0.7583333333333333,
                        "f1": 0.7428571428571429,
                        "true_negative_rate": 0.0375,
                        "false_positive_rate": 0.2125,
                        "false_negative_rate": 0.18125,
                        "true_positive_rate": 0.56875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 34,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.3125,
                        "precision": 0.7058823529411765,
                        "recall": 0.2,
                        "f1": 0.3116883116883117,
                        "true_negative_rate": 0.1875,
                        "false_positive_rate": 0.0625,
                        "false_negative_rate": 0.6,
                        "true_positive_rate": 0.15
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 129,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.68125,
                        "precision": 0.7674418604651163,
                        "recall": 0.825,
                        "f1": 0.7951807228915663,
                        "true_negative_rate": 0.0625,
                        "false_positive_rate": 0.1875,
                        "false_negative_rate": 0.13125,
                        "true_positive_rate": 0.61875
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 36,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.36875,
                        "precision": 0.7777777777777778,
                        "recall": 0.23333333333333334,
                        "f1": 0.358974358974359,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.05,
                        "false_negative_rate": 0.575,
                        "true_positive_rate": 0.175
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4859375,
                            "stdev": 0.15086825533805978
                        },
                        "precision": {
                            "average": 0.7447754977960176,
                            "stdev": 0.02914193080273442
                        },
                        "recall": {
                            "average": 0.5041666666666667,
                            "stdev": 0.28870520335533345
                        },
                        "f1": {
                            "average": 0.552175134102845,
                            "stdev": 0.2182726482580985
                        },
                        "true_negative_rate": {
                            "average": 0.121875,
                            "stdev": 0.07255116728902437
                        },
                        "false_positive_rate": {
                            "average": 0.12812500000000002,
                            "stdev": 0.07255116728902437
                        },
                        "false_negative_rate": {
                            "average": 0.37187499999999996,
                            "stdev": 0.21652890251650006
                        },
                        "true_positive_rate": {
                            "average": 0.378125,
                            "stdev": 0.21652890251650012
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 153,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.76875,
                        "precision": 0.7712418300653595,
                        "recall": 0.9833333333333333,
                        "f1": 0.8644688644688645,
                        "true_negative_rate": 0.03125,
                        "false_positive_rate": 0.21875,
                        "false_negative_rate": 0.0125,
                        "true_positive_rate": 0.7375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 125,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.78125,
                        "precision": 0.856,
                        "recall": 0.8916666666666667,
                        "f1": 0.8734693877551021,
                        "true_negative_rate": 0.1375,
                        "false_positive_rate": 0.1125,
                        "false_negative_rate": 0.08125,
                        "true_positive_rate": 0.66875
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 159,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.74375,
                        "precision": 0.7484276729559748,
                        "recall": 0.9916666666666667,
                        "f1": 0.8530465949820788,
                        "true_negative_rate": 0.0,
                        "false_positive_rate": 0.25,
                        "false_negative_rate": 0.00625,
                        "true_positive_rate": 0.74375
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 5,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.28125,
                        "precision": 1.0,
                        "recall": 0.041666666666666664,
                        "f1": 0.08,
                        "true_negative_rate": 0.25,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.71875,
                        "true_positive_rate": 0.03125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.64375,
                            "stdev": 0.20972452169453148
                        },
                        "precision": {
                            "average": 0.8439173757553335,
                            "stdev": 0.09862529300562928
                        },
                        "recall": {
                            "average": 0.7270833333333333,
                            "stdev": 0.3976657149332003
                        },
                        "f1": {
                            "average": 0.6677462118015114,
                            "stdev": 0.3394126060898462
                        },
                        "true_negative_rate": {
                            "average": 0.1046875,
                            "stdev": 0.09816430343434421
                        },
                        "false_positive_rate": {
                            "average": 0.1453125,
                            "stdev": 0.0981643034343442
                        },
                        "false_negative_rate": {
                            "average": 0.2046875,
                            "stdev": 0.2982492861999002
                        },
                        "true_positive_rate": {
                            "average": 0.5453125,
                            "stdev": 0.29824928619990027
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 101,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.56875,
                        "precision": 0.7722772277227723,
                        "recall": 0.65,
                        "f1": 0.7058823529411765,
                        "true_negative_rate": 0.10625,
                        "false_positive_rate": 0.14375,
                        "false_negative_rate": 0.2625,
                        "true_positive_rate": 0.4875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 83,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.59375,
                        "precision": 0.8433734939759037,
                        "recall": 0.5833333333333334,
                        "f1": 0.6896551724137931,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.08125,
                        "false_negative_rate": 0.3125,
                        "true_positive_rate": 0.4375
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 57,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.5125,
                        "precision": 0.9122807017543859,
                        "recall": 0.43333333333333335,
                        "f1": 0.5875706214689266,
                        "true_negative_rate": 0.21875,
                        "false_positive_rate": 0.03125,
                        "false_negative_rate": 0.425,
                        "true_positive_rate": 0.325
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 21,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.3625,
                        "precision": 0.9523809523809523,
                        "recall": 0.16666666666666666,
                        "f1": 0.28368794326241137,
                        "true_negative_rate": 0.24375,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.625,
                        "true_positive_rate": 0.125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.509375,
                            "stdev": 0.0897587913521567
                        },
                        "precision": {
                            "average": 0.8700780939585036,
                            "stdev": 0.0686166264581129
                        },
                        "recall": {
                            "average": 0.45833333333333337,
                            "stdev": 0.1857791400800663
                        },
                        "f1": {
                            "average": 0.5666990225215769,
                            "stdev": 0.16957381054356926
                        },
                        "true_negative_rate": {
                            "average": 0.184375,
                            "stdev": 0.052570637003939755
                        },
                        "false_positive_rate": {
                            "average": 0.06562499999999999,
                            "stdev": 0.052570637003939755
                        },
                        "false_negative_rate": {
                            "average": 0.40625,
                            "stdev": 0.1393343550600497
                        },
                        "true_positive_rate": {
                            "average": 0.34375,
                            "stdev": 0.1393343550600497
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 137,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.81875,
                        "precision": 0.8321167883211679,
                        "recall": 0.95,
                        "f1": 0.8871595330739299,
                        "true_negative_rate": 0.10625,
                        "false_positive_rate": 0.14375,
                        "false_negative_rate": 0.0375,
                        "true_positive_rate": 0.7125
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 84,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.6875,
                        "precision": 0.9166666666666666,
                        "recall": 0.6416666666666667,
                        "f1": 0.7549019607843137,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.26875,
                        "true_positive_rate": 0.48125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 32,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.4375,
                        "precision": 0.96875,
                        "recall": 0.25833333333333336,
                        "f1": 0.40789473684210525,
                        "true_negative_rate": 0.24375,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.55625,
                        "true_positive_rate": 0.19375
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 8,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.3,
                        "precision": 1.0,
                        "recall": 0.06666666666666667,
                        "f1": 0.125,
                        "true_negative_rate": 0.25,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7,
                        "true_positive_rate": 0.05
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5609375,
                            "stdev": 0.20359920608575566
                        },
                        "precision": {
                            "average": 0.9293833637469586,
                            "stdev": 0.06355891449133351
                        },
                        "recall": {
                            "average": 0.4791666666666667,
                            "stdev": 0.34169207222618175
                        },
                        "f1": {
                            "average": 0.5437390576750872,
                            "stdev": 0.29846418231417565
                        },
                        "true_negative_rate": {
                            "average": 0.2015625,
                            "stdev": 0.05751613224783113
                        },
                        "false_positive_rate": {
                            "average": 0.0484375,
                            "stdev": 0.05751613224783113
                        },
                        "false_negative_rate": {
                            "average": 0.390625,
                            "stdev": 0.25626905416963636
                        },
                        "true_positive_rate": {
                            "average": 0.35937500000000006,
                            "stdev": 0.25626905416963636
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 94,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.6875,
                        "precision": 0.8723404255319149,
                        "recall": 0.6833333333333333,
                        "f1": 0.7663551401869159,
                        "true_negative_rate": 0.175,
                        "false_positive_rate": 0.075,
                        "false_negative_rate": 0.2375,
                        "true_positive_rate": 0.5125
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 29,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.40625,
                        "precision": 0.9310344827586207,
                        "recall": 0.225,
                        "f1": 0.3624161073825503,
                        "true_negative_rate": 0.2375,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.58125,
                        "true_positive_rate": 0.16875
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 62,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.5875,
                        "precision": 0.9354838709677419,
                        "recall": 0.48333333333333334,
                        "f1": 0.6373626373626373,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.3875,
                        "true_positive_rate": 0.3625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 21,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.38125,
                        "precision": 1.0,
                        "recall": 0.175,
                        "f1": 0.2978723404255319,
                        "true_negative_rate": 0.25,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.61875,
                        "true_positive_rate": 0.13125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.515625,
                            "stdev": 0.12720707773154763
                        },
                        "precision": {
                            "average": 0.9347146948145694,
                            "stdev": 0.045185329849496685
                        },
                        "recall": {
                            "average": 0.39166666666666666,
                            "stdev": 0.20505757998940474
                        },
                        "f1": {
                            "average": 0.5160015563394089,
                            "stdev": 0.19272666778296402
                        },
                        "true_negative_rate": {
                            "average": 0.221875,
                            "stdev": 0.02847010493482594
                        },
                        "false_positive_rate": {
                            "average": 0.028124999999999997,
                            "stdev": 0.02847010493482593
                        },
                        "false_negative_rate": {
                            "average": 0.45625000000000004,
                            "stdev": 0.15379318499205355
                        },
                        "true_positive_rate": {
                            "average": 0.29375,
                            "stdev": 0.15379318499205352
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 135,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.75625,
                        "precision": 0.8,
                        "recall": 0.9,
                        "f1": 0.8470588235294118,
                        "true_negative_rate": 0.08125,
                        "false_positive_rate": 0.16875,
                        "false_negative_rate": 0.075,
                        "true_positive_rate": 0.675
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 115,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.73125,
                        "precision": 0.8347826086956521,
                        "recall": 0.8,
                        "f1": 0.8170212765957446,
                        "true_negative_rate": 0.13125,
                        "false_positive_rate": 0.11875,
                        "false_negative_rate": 0.15,
                        "true_positive_rate": 0.6
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 96,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.625,
                        "precision": 0.8125,
                        "recall": 0.65,
                        "f1": 0.7222222222222222,
                        "true_negative_rate": 0.1375,
                        "false_positive_rate": 0.1125,
                        "false_negative_rate": 0.2625,
                        "true_positive_rate": 0.4875
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 46,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.4625,
                        "precision": 0.8695652173913043,
                        "recall": 0.3333333333333333,
                        "f1": 0.4819277108433735,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0375,
                        "false_negative_rate": 0.5,
                        "true_positive_rate": 0.25
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6437499999999999,
                            "stdev": 0.1156672220207609
                        },
                        "precision": {
                            "average": 0.8292119565217391,
                            "stdev": 0.026419905956789367
                        },
                        "recall": {
                            "average": 0.6708333333333334,
                            "stdev": 0.2142088155671159
                        },
                        "f1": {
                            "average": 0.717057508297688,
                            "stdev": 0.14335776300496844
                        },
                        "true_negative_rate": {
                            "average": 0.140625,
                            "stdev": 0.04687499999999999
                        },
                        "false_positive_rate": {
                            "average": 0.10937499999999999,
                            "stdev": 0.04687500000000001
                        },
                        "false_negative_rate": {
                            "average": 0.246875,
                            "stdev": 0.1606566116753369
                        },
                        "true_positive_rate": {
                            "average": 0.503125,
                            "stdev": 0.16065661167533693
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 123,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.71875,
                        "precision": 0.8048780487804879,
                        "recall": 0.825,
                        "f1": 0.8148148148148148,
                        "true_negative_rate": 0.1,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.13125,
                        "true_positive_rate": 0.61875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 33,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.43125,
                        "precision": 0.9393939393939394,
                        "recall": 0.25833333333333336,
                        "f1": 0.40522875816993464,
                        "true_negative_rate": 0.2375,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.55625,
                        "true_positive_rate": 0.19375
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 119,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.61875,
                        "precision": 0.7478991596638656,
                        "recall": 0.7416666666666667,
                        "f1": 0.7447698744769874,
                        "true_negative_rate": 0.0625,
                        "false_positive_rate": 0.1875,
                        "false_negative_rate": 0.19375,
                        "true_positive_rate": 0.55625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 8,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.3,
                        "precision": 1.0,
                        "recall": 0.06666666666666667,
                        "f1": 0.125,
                        "true_negative_rate": 0.25,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7,
                        "true_positive_rate": 0.05
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5171874999999999,
                            "stdev": 0.16240231438852712
                        },
                        "precision": {
                            "average": 0.8730427869595732,
                            "stdev": 0.1010296594522013
                        },
                        "recall": {
                            "average": 0.47291666666666665,
                            "stdev": 0.3190902323620842
                        },
                        "f1": {
                            "average": 0.5224533618654342,
                            "stdev": 0.27686216494861593
                        },
                        "true_negative_rate": {
                            "average": 0.1625,
                            "stdev": 0.08244316223920574
                        },
                        "false_positive_rate": {
                            "average": 0.0875,
                            "stdev": 0.08244316223920574
                        },
                        "false_negative_rate": {
                            "average": 0.39531249999999996,
                            "stdev": 0.23931767427156314
                        },
                        "true_positive_rate": {
                            "average": 0.3546875,
                            "stdev": 0.23931767427156317
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 105,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.78125,
                        "precision": 0.9047619047619048,
                        "recall": 0.7916666666666666,
                        "f1": 0.8444444444444444,
                        "true_negative_rate": 0.1875,
                        "false_positive_rate": 0.0625,
                        "false_negative_rate": 0.15625,
                        "true_positive_rate": 0.59375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 88,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.7375,
                        "precision": 0.9431818181818182,
                        "recall": 0.6916666666666667,
                        "f1": 0.7980769230769231,
                        "true_negative_rate": 0.21875,
                        "false_positive_rate": 0.03125,
                        "false_negative_rate": 0.23125,
                        "true_positive_rate": 0.51875
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 122,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.8,
                        "precision": 0.860655737704918,
                        "recall": 0.875,
                        "f1": 0.8677685950413223,
                        "true_negative_rate": 0.14375,
                        "false_positive_rate": 0.10625,
                        "false_negative_rate": 0.09375,
                        "true_positive_rate": 0.65625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 79,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.69375,
                        "precision": 0.9493670886075949,
                        "recall": 0.625,
                        "f1": 0.7537688442211056,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.28125,
                        "true_positive_rate": 0.46875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.753125,
                            "stdev": 0.04110295761864347
                        },
                        "precision": {
                            "average": 0.914491637314059,
                            "stdev": 0.03546968904365561
                        },
                        "recall": {
                            "average": 0.7458333333333333,
                            "stdev": 0.0952883052169106
                        },
                        "f1": {
                            "average": 0.8160147016959488,
                            "stdev": 0.04382643729449614
                        },
                        "true_negative_rate": {
                            "average": 0.19375,
                            "stdev": 0.03217384419058438
                        },
                        "false_positive_rate": {
                            "average": 0.05625,
                            "stdev": 0.032173844190584375
                        },
                        "false_negative_rate": {
                            "average": 0.190625,
                            "stdev": 0.07146622891268295
                        },
                        "true_positive_rate": {
                            "average": 0.559375,
                            "stdev": 0.07146622891268294
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 113,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.84375,
                        "precision": 0.9203539823008849,
                        "recall": 0.8666666666666667,
                        "f1": 0.8927038626609443,
                        "true_negative_rate": 0.19375,
                        "false_positive_rate": 0.05625,
                        "false_negative_rate": 0.1,
                        "true_positive_rate": 0.65
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 109,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.84375,
                        "precision": 0.9357798165137615,
                        "recall": 0.85,
                        "f1": 0.8908296943231441,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.1125,
                        "true_positive_rate": 0.6375
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 107,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.83125,
                        "precision": 0.9345794392523364,
                        "recall": 0.8333333333333334,
                        "f1": 0.8810572687224669,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.125,
                        "true_positive_rate": 0.625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 97,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.78125,
                        "precision": 0.9381443298969072,
                        "recall": 0.7583333333333333,
                        "f1": 0.8387096774193549,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0375,
                        "false_negative_rate": 0.18125,
                        "true_positive_rate": 0.56875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.825,
                            "stdev": 0.025769410160110383
                        },
                        "precision": {
                            "average": 0.9322143919909724,
                            "stdev": 0.006966691864887426
                        },
                        "recall": {
                            "average": 0.8270833333333334,
                            "stdev": 0.041405431071566245
                        },
                        "f1": {
                            "average": 0.8758251257814775,
                            "stdev": 0.021880131756025704
                        },
                        "true_negative_rate": {
                            "average": 0.2046875,
                            "stdev": 0.006810779599282298
                        },
                        "false_positive_rate": {
                            "average": 0.0453125,
                            "stdev": 0.006810779599282304
                        },
                        "false_negative_rate": {
                            "average": 0.1296875,
                            "stdev": 0.03105407330367467
                        },
                        "true_positive_rate": {
                            "average": 0.6203125,
                            "stdev": 0.03105407330367468
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 125,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.83125,
                        "precision": 0.872,
                        "recall": 0.9083333333333333,
                        "f1": 0.889795918367347,
                        "true_negative_rate": 0.15,
                        "false_positive_rate": 0.1,
                        "false_negative_rate": 0.06875,
                        "true_positive_rate": 0.68125
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 119,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.83125,
                        "precision": 0.8907563025210085,
                        "recall": 0.8833333333333333,
                        "f1": 0.8870292887029289,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.08125,
                        "false_negative_rate": 0.0875,
                        "true_positive_rate": 0.6625
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 115,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.85625,
                        "precision": 0.9217391304347826,
                        "recall": 0.8833333333333333,
                        "f1": 0.902127659574468,
                        "true_negative_rate": 0.19375,
                        "false_positive_rate": 0.05625,
                        "false_negative_rate": 0.0875,
                        "true_positive_rate": 0.6625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 109,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.83125,
                        "precision": 0.926605504587156,
                        "recall": 0.8416666666666667,
                        "f1": 0.8820960698689956,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.05,
                        "false_negative_rate": 0.11875,
                        "true_positive_rate": 0.63125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.8374999999999999,
                            "stdev": 0.010825317547305445
                        },
                        "precision": {
                            "average": 0.9027752343857368,
                            "stdev": 0.022467092078736407
                        },
                        "recall": {
                            "average": 0.8791666666666667,
                            "stdev": 0.023935677693908447
                        },
                        "f1": {
                            "average": 0.8902622341284349,
                            "stdev": 0.007384848445311094
                        },
                        "true_negative_rate": {
                            "average": 0.17812499999999998,
                            "stdev": 0.020009763241977657
                        },
                        "false_positive_rate": {
                            "average": 0.07187500000000001,
                            "stdev": 0.020009763241977653
                        },
                        "false_negative_rate": {
                            "average": 0.090625,
                            "stdev": 0.017951758270431335
                        },
                        "true_positive_rate": {
                            "average": 0.659375,
                            "stdev": 0.017951758270431353
                        }
                    }
                }
            }
        }
    },
    "finegrained_fact_verification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 111,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.5495495495495496,
                        "recall": 0.782051282051282,
                        "f1": 0.6455026455026455,
                        "true_negative_rate": 0.08571428571428572,
                        "false_positive_rate": 0.35714285714285715,
                        "false_negative_rate": 0.12142857142857143,
                        "true_positive_rate": 0.4357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 92,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.532608695652174,
                        "recall": 0.6282051282051282,
                        "f1": 0.5764705882352941,
                        "true_negative_rate": 0.1357142857142857,
                        "false_positive_rate": 0.30714285714285716,
                        "false_negative_rate": 0.20714285714285716,
                        "true_positive_rate": 0.35
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 92,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5428571428571428,
                        "precision": 0.5760869565217391,
                        "recall": 0.6794871794871795,
                        "f1": 0.6235294117647059,
                        "true_negative_rate": 0.16428571428571428,
                        "false_positive_rate": 0.2785714285714286,
                        "false_negative_rate": 0.17857142857142858,
                        "true_positive_rate": 0.37857142857142856
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 55,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5214285714285715,
                        "precision": 0.6,
                        "recall": 0.4230769230769231,
                        "f1": 0.49624060150375937,
                        "true_negative_rate": 0.2857142857142857,
                        "false_positive_rate": 0.15714285714285714,
                        "false_negative_rate": 0.32142857142857145,
                        "true_positive_rate": 0.2357142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5107142857142857,
                            "stdev": 0.022868300847974445
                        },
                        "precision": {
                            "average": 0.5645613004308657,
                            "stdev": 0.02566643621610924
                        },
                        "recall": {
                            "average": 0.6282051282051281,
                            "stdev": 0.1307440900921227
                        },
                        "f1": {
                            "average": 0.5854358117516012,
                            "stdev": 0.05721736546427154
                        },
                        "true_negative_rate": {
                            "average": 0.16785714285714284,
                            "stdev": 0.07362688617174393
                        },
                        "false_positive_rate": {
                            "average": 0.275,
                            "stdev": 0.07362688617174394
                        },
                        "false_negative_rate": {
                            "average": 0.20714285714285713,
                            "stdev": 0.07284313590846836
                        },
                        "true_positive_rate": {
                            "average": 0.35,
                            "stdev": 0.07284313590846836
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 136,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.5588235294117647,
                        "recall": 0.9743589743589743,
                        "f1": 0.7102803738317757,
                        "true_negative_rate": 0.014285714285714285,
                        "false_positive_rate": 0.42857142857142855,
                        "false_negative_rate": 0.014285714285714285,
                        "true_positive_rate": 0.5428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 87,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.42857142857142855,
                        "precision": 0.4942528735632184,
                        "recall": 0.5512820512820513,
                        "f1": 0.5212121212121212,
                        "true_negative_rate": 0.12857142857142856,
                        "false_positive_rate": 0.3142857142857143,
                        "false_negative_rate": 0.25,
                        "true_positive_rate": 0.30714285714285716
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 130,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.5538461538461539,
                        "recall": 0.9230769230769231,
                        "f1": 0.6923076923076923,
                        "true_negative_rate": 0.02857142857142857,
                        "false_positive_rate": 0.4142857142857143,
                        "false_negative_rate": 0.04285714285714286,
                        "true_positive_rate": 0.5142857142857142
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 62,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.44285714285714284,
                        "precision": 0.5161290322580645,
                        "recall": 0.41025641025641024,
                        "f1": 0.45714285714285713,
                        "true_negative_rate": 0.22857142857142856,
                        "false_positive_rate": 0.21428571428571427,
                        "false_negative_rate": 0.32857142857142857,
                        "true_positive_rate": 0.22857142857142856
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.48928571428571427,
                            "stdev": 0.05404552125150558
                        },
                        "precision": {
                            "average": 0.5307628972698004,
                            "stdev": 0.026773903556510464
                        },
                        "recall": {
                            "average": 0.7147435897435898,
                            "stdev": 0.23991406937435053
                        },
                        "f1": {
                            "average": 0.5952357611236115,
                            "stdev": 0.1086362878284257
                        },
                        "true_negative_rate": {
                            "average": 0.1,
                            "stdev": 0.08630747123996123
                        },
                        "false_positive_rate": {
                            "average": 0.34285714285714286,
                            "stdev": 0.08630747123996123
                        },
                        "false_negative_rate": {
                            "average": 0.15892857142857142,
                            "stdev": 0.1336664100799953
                        },
                        "true_positive_rate": {
                            "average": 0.39821428571428574,
                            "stdev": 0.13366641007999527
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.5571428571428572,
                        "recall": 1.0,
                        "f1": 0.7155963302752294,
                        "true_negative_rate": 0.0,
                        "false_positive_rate": 0.44285714285714284,
                        "false_negative_rate": 0.0,
                        "true_positive_rate": 0.5571428571428572
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 123,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.5691056910569106,
                        "recall": 0.8974358974358975,
                        "f1": 0.6965174129353234,
                        "true_negative_rate": 0.06428571428571428,
                        "false_positive_rate": 0.37857142857142856,
                        "false_negative_rate": 0.05714285714285714,
                        "true_positive_rate": 0.5
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 137,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5642857142857143,
                        "precision": 0.5620437956204379,
                        "recall": 0.9871794871794872,
                        "f1": 0.7162790697674418,
                        "true_negative_rate": 0.014285714285714285,
                        "false_positive_rate": 0.42857142857142855,
                        "false_negative_rate": 0.007142857142857143,
                        "true_positive_rate": 0.55
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 66,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.5303030303030303,
                        "recall": 0.44871794871794873,
                        "f1": 0.4861111111111111,
                        "true_negative_rate": 0.22142857142857142,
                        "false_positive_rate": 0.22142857142857142,
                        "false_negative_rate": 0.30714285714285716,
                        "true_positive_rate": 0.25
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5375,
                            "stdev": 0.038257652295765276
                        },
                        "precision": {
                            "average": 0.554648843530809,
                            "stdev": 0.014685234782084887
                        },
                        "recall": {
                            "average": 0.8333333333333334,
                            "stdev": 0.22554630879178436
                        },
                        "f1": {
                            "average": 0.6536259810222764,
                            "stdev": 0.09703947687506967
                        },
                        "true_negative_rate": {
                            "average": 0.075,
                            "stdev": 0.08784552768749172
                        },
                        "false_positive_rate": {
                            "average": 0.3678571428571429,
                            "stdev": 0.08784552768749174
                        },
                        "false_negative_rate": {
                            "average": 0.09285714285714286,
                            "stdev": 0.12566151489827987
                        },
                        "true_positive_rate": {
                            "average": 0.4642857142857143,
                            "stdev": 0.12566151489827987
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 107,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.5794392523364486,
                        "recall": 0.7948717948717948,
                        "f1": 0.6702702702702703,
                        "true_negative_rate": 0.12142857142857143,
                        "false_positive_rate": 0.32142857142857145,
                        "false_negative_rate": 0.11428571428571428,
                        "true_positive_rate": 0.44285714285714284
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 78,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.6,
                        "precision": 0.6410256410256411,
                        "recall": 0.6410256410256411,
                        "f1": 0.6410256410256411,
                        "true_negative_rate": 0.24285714285714285,
                        "false_positive_rate": 0.2,
                        "false_negative_rate": 0.2,
                        "true_positive_rate": 0.35714285714285715
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 29,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.45,
                        "precision": 0.5517241379310345,
                        "recall": 0.20512820512820512,
                        "f1": 0.29906542056074764,
                        "true_negative_rate": 0.35,
                        "false_positive_rate": 0.09285714285714286,
                        "false_negative_rate": 0.44285714285714284,
                        "true_positive_rate": 0.11428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 16,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.38571428571428573,
                        "precision": 0.3125,
                        "recall": 0.0641025641025641,
                        "f1": 0.10638297872340426,
                        "true_negative_rate": 0.36428571428571427,
                        "false_positive_rate": 0.07857142857142857,
                        "false_negative_rate": 0.5214285714285715,
                        "true_positive_rate": 0.03571428571428571
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4928571428571428,
                            "stdev": 0.081597568991399
                        },
                        "precision": {
                            "average": 0.521172257823281,
                            "stdev": 0.12473712827165141
                        },
                        "recall": {
                            "average": 0.42628205128205127,
                            "stdev": 0.30085553552234645
                        },
                        "f1": {
                            "average": 0.4291860776450159,
                            "stdev": 0.23671228032175592
                        },
                        "true_negative_rate": {
                            "average": 0.26964285714285713,
                            "stdev": 0.09759545301385505
                        },
                        "false_positive_rate": {
                            "average": 0.1732142857142857,
                            "stdev": 0.09759545301385507
                        },
                        "false_negative_rate": {
                            "average": 0.3196428571428571,
                            "stdev": 0.16761951264816446
                        },
                        "true_positive_rate": {
                            "average": 0.23750000000000002,
                            "stdev": 0.16761951264816446
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 116,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5428571428571428,
                        "precision": 0.5603448275862069,
                        "recall": 0.8333333333333334,
                        "f1": 0.6701030927835051,
                        "true_negative_rate": 0.07857142857142857,
                        "false_positive_rate": 0.36428571428571427,
                        "false_negative_rate": 0.09285714285714286,
                        "true_positive_rate": 0.4642857142857143
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 122,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.5655737704918032,
                        "recall": 0.8846153846153846,
                        "f1": 0.69,
                        "true_negative_rate": 0.06428571428571428,
                        "false_positive_rate": 0.37857142857142856,
                        "false_negative_rate": 0.06428571428571428,
                        "true_positive_rate": 0.4928571428571429
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 72,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.5555555555555556,
                        "recall": 0.5128205128205128,
                        "f1": 0.5333333333333333,
                        "true_negative_rate": 0.21428571428571427,
                        "false_positive_rate": 0.22857142857142856,
                        "false_negative_rate": 0.2714285714285714,
                        "true_positive_rate": 0.2857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 36,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.45,
                        "precision": 0.5277777777777778,
                        "recall": 0.24358974358974358,
                        "f1": 0.3333333333333333,
                        "true_negative_rate": 0.32142857142857145,
                        "false_positive_rate": 0.12142857142857143,
                        "false_negative_rate": 0.42142857142857143,
                        "true_positive_rate": 0.1357142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5107142857142858,
                            "stdev": 0.03992978531249624
                        },
                        "precision": {
                            "average": 0.5523129828528359,
                            "stdev": 0.01460179427702592
                        },
                        "recall": {
                            "average": 0.6185897435897436,
                            "stdev": 0.2591797548018465
                        },
                        "f1": {
                            "average": 0.5566924398625429,
                            "stdev": 0.1423620561395578
                        },
                        "true_negative_rate": {
                            "average": 0.16964285714285715,
                            "stdev": 0.1053874048548536
                        },
                        "false_positive_rate": {
                            "average": 0.2732142857142857,
                            "stdev": 0.10538740485485357
                        },
                        "false_negative_rate": {
                            "average": 0.2125,
                            "stdev": 0.14440014910388593
                        },
                        "true_positive_rate": {
                            "average": 0.34464285714285714,
                            "stdev": 0.14440014910388596
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 49,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.5714285714285714,
                        "recall": 0.358974358974359,
                        "f1": 0.4409448818897638,
                        "true_negative_rate": 0.29285714285714287,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.35714285714285715,
                        "true_positive_rate": 0.2
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 23,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.6521739130434783,
                        "recall": 0.19230769230769232,
                        "f1": 0.297029702970297,
                        "true_negative_rate": 0.38571428571428573,
                        "false_positive_rate": 0.05714285714285714,
                        "false_negative_rate": 0.45,
                        "true_positive_rate": 0.10714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 15,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4642857142857143,
                        "precision": 0.6,
                        "recall": 0.11538461538461539,
                        "f1": 0.1935483870967742,
                        "true_negative_rate": 0.4,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.4928571428571429,
                        "true_positive_rate": 0.06428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 4,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.45714285714285713,
                        "precision": 0.75,
                        "recall": 0.038461538461538464,
                        "f1": 0.07317073170731707,
                        "true_negative_rate": 0.4357142857142857,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5357142857142857,
                        "true_positive_rate": 0.02142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.47500000000000003,
                            "stdev": 0.014725377234348797
                        },
                        "precision": {
                            "average": 0.6434006211180124,
                            "stdev": 0.06801468238395961
                        },
                        "recall": {
                            "average": 0.1762820512820513,
                            "stdev": 0.11867633706136935
                        },
                        "f1": {
                            "average": 0.25117342591603803,
                            "stdev": 0.13520503678871376
                        },
                        "true_negative_rate": {
                            "average": 0.3785714285714286,
                            "stdev": 0.052731510929405
                        },
                        "false_positive_rate": {
                            "average": 0.06428571428571428,
                            "stdev": 0.052731510929405
                        },
                        "false_negative_rate": {
                            "average": 0.45892857142857146,
                            "stdev": 0.06611967350562006
                        },
                        "true_positive_rate": {
                            "average": 0.09821428571428571,
                            "stdev": 0.06611967350562006
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 91,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5785714285714286,
                        "precision": 0.6043956043956044,
                        "recall": 0.7051282051282052,
                        "f1": 0.650887573964497,
                        "true_negative_rate": 0.18571428571428572,
                        "false_positive_rate": 0.2571428571428571,
                        "false_negative_rate": 0.16428571428571428,
                        "true_positive_rate": 0.39285714285714285
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 63,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.5555555555555556,
                        "recall": 0.44871794871794873,
                        "f1": 0.49645390070921985,
                        "true_negative_rate": 0.24285714285714285,
                        "false_positive_rate": 0.2,
                        "false_negative_rate": 0.30714285714285716,
                        "true_positive_rate": 0.25
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 42,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5714285714285714,
                        "precision": 0.7142857142857143,
                        "recall": 0.38461538461538464,
                        "f1": 0.5,
                        "true_negative_rate": 0.35714285714285715,
                        "false_positive_rate": 0.08571428571428572,
                        "false_negative_rate": 0.34285714285714286,
                        "true_positive_rate": 0.21428571428571427
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 2,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.42857142857142855,
                        "precision": 0.0,
                        "recall": 0.0,
                        "f1": 0.0,
                        "true_negative_rate": 0.42857142857142855,
                        "false_positive_rate": 0.014285714285714285,
                        "false_negative_rate": 0.5571428571428572,
                        "true_positive_rate": 0.0
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5178571428571429,
                            "stdev": 0.061548885498621746
                        },
                        "precision": {
                            "average": 0.46855921855921856,
                            "stdev": 0.2765633464356407
                        },
                        "recall": {
                            "average": 0.38461538461538464,
                            "stdev": 0.25237204724396833
                        },
                        "f1": {
                            "average": 0.41183536866842924,
                            "stdev": 0.24580867861293287
                        },
                        "true_negative_rate": {
                            "average": 0.3035714285714286,
                            "stdev": 0.09496239857363092
                        },
                        "false_positive_rate": {
                            "average": 0.13928571428571426,
                            "stdev": 0.09496239857363092
                        },
                        "false_negative_rate": {
                            "average": 0.34285714285714286,
                            "stdev": 0.14060728346449664
                        },
                        "true_positive_rate": {
                            "average": 0.21428571428571427,
                            "stdev": 0.14060728346449664
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 84,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5428571428571428,
                        "precision": 0.5833333333333334,
                        "recall": 0.6282051282051282,
                        "f1": 0.6049382716049383,
                        "true_negative_rate": 0.19285714285714287,
                        "false_positive_rate": 0.25,
                        "false_negative_rate": 0.20714285714285716,
                        "true_positive_rate": 0.35
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 33,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4785714285714286,
                        "precision": 0.5757575757575758,
                        "recall": 0.24358974358974358,
                        "f1": 0.34234234234234234,
                        "true_negative_rate": 0.34285714285714286,
                        "false_positive_rate": 0.1,
                        "false_negative_rate": 0.42142857142857143,
                        "true_positive_rate": 0.1357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 76,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5428571428571428,
                        "precision": 0.5921052631578947,
                        "recall": 0.5769230769230769,
                        "f1": 0.5844155844155844,
                        "true_negative_rate": 0.22142857142857142,
                        "false_positive_rate": 0.22142857142857142,
                        "false_negative_rate": 0.2357142857142857,
                        "true_positive_rate": 0.32142857142857145
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 18,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.7222222222222222,
                        "recall": 0.16666666666666666,
                        "f1": 0.2708333333333333,
                        "true_negative_rate": 0.40714285714285714,
                        "false_positive_rate": 0.03571428571428571,
                        "false_negative_rate": 0.4642857142857143,
                        "true_positive_rate": 0.09285714285714286
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5160714285714285,
                            "stdev": 0.02783653083592836
                        },
                        "precision": {
                            "average": 0.6183545986177565,
                            "stdev": 0.06024638209883648
                        },
                        "recall": {
                            "average": 0.40384615384615385,
                            "stdev": 0.20138818041544881
                        },
                        "f1": {
                            "average": 0.45063238292404956,
                            "stdev": 0.146426332507029
                        },
                        "true_negative_rate": {
                            "average": 0.29107142857142854,
                            "stdev": 0.08753643556268674
                        },
                        "false_positive_rate": {
                            "average": 0.15178571428571427,
                            "stdev": 0.08753643556268674
                        },
                        "false_negative_rate": {
                            "average": 0.3321428571428572,
                            "stdev": 0.11220198623146435
                        },
                        "true_positive_rate": {
                            "average": 0.225,
                            "stdev": 0.11220198623146435
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 62,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.6285714285714286,
                        "precision": 0.7096774193548387,
                        "recall": 0.5641025641025641,
                        "f1": 0.6285714285714286,
                        "true_negative_rate": 0.3142857142857143,
                        "false_positive_rate": 0.12857142857142856,
                        "false_negative_rate": 0.24285714285714285,
                        "true_positive_rate": 0.3142857142857143
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 29,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5071428571428571,
                        "precision": 0.6551724137931034,
                        "recall": 0.24358974358974358,
                        "f1": 0.35514018691588783,
                        "true_negative_rate": 0.37142857142857144,
                        "false_positive_rate": 0.07142857142857142,
                        "false_negative_rate": 0.42142857142857143,
                        "true_positive_rate": 0.1357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 67,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.6214285714285714,
                        "precision": 0.6865671641791045,
                        "recall": 0.5897435897435898,
                        "f1": 0.6344827586206897,
                        "true_negative_rate": 0.29285714285714287,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.22857142857142856,
                        "true_positive_rate": 0.32857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 19,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.8421052631578947,
                        "recall": 0.20512820512820512,
                        "f1": 0.32989690721649484,
                        "true_negative_rate": 0.42142857142857143,
                        "false_positive_rate": 0.02142857142857143,
                        "false_negative_rate": 0.44285714285714284,
                        "true_positive_rate": 0.11428571428571428
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5732142857142857,
                            "stdev": 0.05282214092053229
                        },
                        "precision": {
                            "average": 0.7233805651212353,
                            "stdev": 0.07122308004488166
                        },
                        "recall": {
                            "average": 0.4006410256410256,
                            "stdev": 0.17703800614924972
                        },
                        "f1": {
                            "average": 0.48702282033112526,
                            "stdev": 0.144794702478866
                        },
                        "true_negative_rate": {
                            "average": 0.35000000000000003,
                            "stdev": 0.05025445456953674
                        },
                        "false_positive_rate": {
                            "average": 0.09285714285714285,
                            "stdev": 0.05025445456953674
                        },
                        "false_negative_rate": {
                            "average": 0.3339285714285714,
                            "stdev": 0.09863546056886768
                        },
                        "true_positive_rate": {
                            "average": 0.2232142857142857,
                            "stdev": 0.0986354605688677
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 8,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.875,
                        "recall": 0.08974358974358974,
                        "f1": 0.16279069767441862,
                        "true_negative_rate": 0.4357142857142857,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5071428571428571,
                        "true_positive_rate": 0.05
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 7,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4785714285714286,
                        "precision": 0.8571428571428571,
                        "recall": 0.07692307692307693,
                        "f1": 0.1411764705882353,
                        "true_negative_rate": 0.4357142857142857,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5142857142857142,
                        "true_positive_rate": 0.04285714285714286
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 5,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4642857142857143,
                        "precision": 0.8,
                        "recall": 0.05128205128205128,
                        "f1": 0.0963855421686747,
                        "true_negative_rate": 0.4357142857142857,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5285714285714286,
                        "true_positive_rate": 0.02857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 4,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 1.0,
                        "recall": 0.05128205128205128,
                        "f1": 0.0975609756097561,
                        "true_negative_rate": 0.44285714285714284,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.5285714285714286,
                        "true_positive_rate": 0.02857142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.475,
                            "stdev": 0.007985957062499245
                        },
                        "precision": {
                            "average": 0.8830357142857144,
                            "stdev": 0.07299073012152617
                        },
                        "recall": {
                            "average": 0.0673076923076923,
                            "stdev": 0.01665433468816228
                        },
                        "f1": {
                            "average": 0.12447842151027116,
                            "stdev": 0.028550018000855323
                        },
                        "true_negative_rate": {
                            "average": 0.4375,
                            "stdev": 0.0030929478706586983
                        },
                        "false_positive_rate": {
                            "average": 0.005357142857142857,
                            "stdev": 0.003092947870658709
                        },
                        "false_negative_rate": {
                            "average": 0.5196428571428571,
                            "stdev": 0.009278843611976149
                        },
                        "true_positive_rate": {
                            "average": 0.0375,
                            "stdev": 0.00927884361197613
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 15,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.7333333333333333,
                        "recall": 0.14102564102564102,
                        "f1": 0.23655913978494625,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.02857142857142857,
                        "false_negative_rate": 0.4785714285714286,
                        "true_positive_rate": 0.07857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 15,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.7333333333333333,
                        "recall": 0.14102564102564102,
                        "f1": 0.23655913978494625,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.02857142857142857,
                        "false_negative_rate": 0.4785714285714286,
                        "true_positive_rate": 0.07857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 11,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5214285714285715,
                        "precision": 1.0,
                        "recall": 0.14102564102564102,
                        "f1": 0.24719101123595505,
                        "true_negative_rate": 0.44285714285714284,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.4785714285714286,
                        "true_positive_rate": 0.07857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 4,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.45714285714285713,
                        "precision": 0.75,
                        "recall": 0.038461538461538464,
                        "f1": 0.07317073170731707,
                        "true_negative_rate": 0.4357142857142857,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5357142857142857,
                        "true_positive_rate": 0.02142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.49107142857142855,
                            "stdev": 0.02279847381214949
                        },
                        "precision": {
                            "average": 0.8041666666666667,
                            "stdev": 0.11326897682556823
                        },
                        "recall": {
                            "average": 0.11538461538461539,
                            "stdev": 0.04441155916843275
                        },
                        "f1": {
                            "average": 0.19837000562829113,
                            "stdev": 0.07241403302218545
                        },
                        "true_negative_rate": {
                            "average": 0.42678571428571427,
                            "stdev": 0.01275255076525507
                        },
                        "false_positive_rate": {
                            "average": 0.01607142857142857,
                            "stdev": 0.01275255076525509
                        },
                        "false_negative_rate": {
                            "average": 0.4928571428571429,
                            "stdev": 0.02474358296526966
                        },
                        "true_positive_rate": {
                            "average": 0.06428571428571428,
                            "stdev": 0.024743582965269673
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 119,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.70625,
                        "precision": 0.8319327731092437,
                        "recall": 0.7857142857142857,
                        "f1": 0.8081632653061225,
                        "true_negative_rate": 0.0875,
                        "false_positive_rate": 0.125,
                        "false_negative_rate": 0.16875,
                        "true_positive_rate": 0.61875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 87,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.60625,
                        "precision": 0.8620689655172413,
                        "recall": 0.5952380952380952,
                        "f1": 0.704225352112676,
                        "true_negative_rate": 0.1375,
                        "false_positive_rate": 0.075,
                        "false_negative_rate": 0.31875,
                        "true_positive_rate": 0.46875
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 126,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.6875,
                        "precision": 0.8015873015873016,
                        "recall": 0.8015873015873016,
                        "f1": 0.8015873015873016,
                        "true_negative_rate": 0.05625,
                        "false_positive_rate": 0.15625,
                        "false_negative_rate": 0.15625,
                        "true_positive_rate": 0.63125
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 47,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.38125,
                        "precision": 0.7872340425531915,
                        "recall": 0.29365079365079366,
                        "f1": 0.4277456647398844,
                        "true_negative_rate": 0.15,
                        "false_positive_rate": 0.0625,
                        "false_negative_rate": 0.55625,
                        "true_positive_rate": 0.23125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5953125,
                            "stdev": 0.1291782168894973
                        },
                        "precision": {
                            "average": 0.8207057706917446,
                            "stdev": 0.028822005112991694
                        },
                        "recall": {
                            "average": 0.6190476190476191,
                            "stdev": 0.2046634437813909
                        },
                        "f1": {
                            "average": 0.685430395936496,
                            "stdev": 0.15436195695799407
                        },
                        "true_negative_rate": {
                            "average": 0.1078125,
                            "stdev": 0.037856379498705364
                        },
                        "false_positive_rate": {
                            "average": 0.1046875,
                            "stdev": 0.037856379498705364
                        },
                        "false_negative_rate": {
                            "average": 0.30000000000000004,
                            "stdev": 0.16117246197784535
                        },
                        "true_positive_rate": {
                            "average": 0.4875,
                            "stdev": 0.16117246197784535
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 155,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.78125,
                        "precision": 0.7935483870967742,
                        "recall": 0.9761904761904762,
                        "f1": 0.8754448398576512,
                        "true_negative_rate": 0.0125,
                        "false_positive_rate": 0.2,
                        "false_negative_rate": 0.01875,
                        "true_positive_rate": 0.76875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 96,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.54375,
                        "precision": 0.8020833333333334,
                        "recall": 0.6111111111111112,
                        "f1": 0.6936936936936937,
                        "true_negative_rate": 0.09375,
                        "false_positive_rate": 0.11875,
                        "false_negative_rate": 0.30625,
                        "true_positive_rate": 0.48125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 141,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.74375,
                        "precision": 0.8014184397163121,
                        "recall": 0.8968253968253969,
                        "f1": 0.846441947565543,
                        "true_negative_rate": 0.0375,
                        "false_positive_rate": 0.175,
                        "false_negative_rate": 0.08125,
                        "true_positive_rate": 0.70625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 97,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.56875,
                        "precision": 0.7938144329896907,
                        "recall": 0.6111111111111112,
                        "f1": 0.6905829596412556,
                        "true_negative_rate": 0.0875,
                        "false_positive_rate": 0.125,
                        "false_negative_rate": 0.30625,
                        "true_positive_rate": 0.48125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.659375,
                            "stdev": 0.10434879910665001
                        },
                        "precision": {
                            "average": 0.7977161482840276,
                            "stdev": 0.004042674939210592
                        },
                        "recall": {
                            "average": 0.7738095238095238,
                            "stdev": 0.16510034970408508
                        },
                        "f1": {
                            "average": 0.7765408601895358,
                            "stdev": 0.08503024884356888
                        },
                        "true_negative_rate": {
                            "average": 0.057812499999999996,
                            "stdev": 0.03405389799641151
                        },
                        "false_positive_rate": {
                            "average": 0.15468749999999998,
                            "stdev": 0.034053897996411515
                        },
                        "false_negative_rate": {
                            "average": 0.178125,
                            "stdev": 0.130016525391967
                        },
                        "true_positive_rate": {
                            "average": 0.609375,
                            "stdev": 0.13001652539196704
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 158,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.775,
                        "precision": 0.7848101265822784,
                        "recall": 0.9841269841269841,
                        "f1": 0.8732394366197183,
                        "true_negative_rate": 0.0,
                        "false_positive_rate": 0.2125,
                        "false_negative_rate": 0.0125,
                        "true_positive_rate": 0.775
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 127,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.70625,
                        "precision": 0.8110236220472441,
                        "recall": 0.8174603174603174,
                        "f1": 0.8142292490118577,
                        "true_negative_rate": 0.0625,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.14375,
                        "true_positive_rate": 0.64375
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 146,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.75,
                        "precision": 0.7945205479452054,
                        "recall": 0.9206349206349206,
                        "f1": 0.8529411764705882,
                        "true_negative_rate": 0.025,
                        "false_positive_rate": 0.1875,
                        "false_negative_rate": 0.0625,
                        "true_positive_rate": 0.725
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 88,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.575,
                        "precision": 0.8295454545454546,
                        "recall": 0.5793650793650794,
                        "f1": 0.6822429906542056,
                        "true_negative_rate": 0.11875,
                        "false_positive_rate": 0.09375,
                        "false_negative_rate": 0.33125,
                        "true_positive_rate": 0.45625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7015625000000001,
                            "stdev": 0.07710268619672082
                        },
                        "precision": {
                            "average": 0.8049749377800457,
                            "stdev": 0.01700154958337551
                        },
                        "recall": {
                            "average": 0.8253968253968254,
                            "stdev": 0.15399688818993848
                        },
                        "f1": {
                            "average": 0.8056632131890924,
                            "stdev": 0.07434325807608619
                        },
                        "true_negative_rate": {
                            "average": 0.0515625,
                            "stdev": 0.044715900066419324
                        },
                        "false_positive_rate": {
                            "average": 0.1609375,
                            "stdev": 0.044715900066419324
                        },
                        "false_negative_rate": {
                            "average": 0.1375,
                            "stdev": 0.12127254944957659
                        },
                        "true_positive_rate": {
                            "average": 0.65,
                            "stdev": 0.1212725494495766
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 63,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.51875,
                        "precision": 0.8888888888888888,
                        "recall": 0.4444444444444444,
                        "f1": 0.5925925925925926,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.4375,
                        "true_positive_rate": 0.35
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 54,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.51875,
                        "precision": 0.9629629629629629,
                        "recall": 0.4126984126984127,
                        "f1": 0.5777777777777777,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.4625,
                        "true_positive_rate": 0.325
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 13,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.2875,
                        "precision": 1.0,
                        "recall": 0.10317460317460317,
                        "f1": 0.18705035971223022,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.70625,
                        "true_positive_rate": 0.08125
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 4,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.2375,
                        "precision": 1.0,
                        "recall": 0.031746031746031744,
                        "f1": 0.06153846153846154,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7625,
                        "true_positive_rate": 0.025
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.39062500000000006,
                            "stdev": 0.129338763041093
                        },
                        "precision": {
                            "average": 0.9629629629629629,
                            "stdev": 0.04536092116265147
                        },
                        "recall": {
                            "average": 0.248015873015873,
                            "stdev": 0.18265826031271792
                        },
                        "f1": {
                            "average": 0.3547397979052655,
                            "stdev": 0.23473744041745792
                        },
                        "true_negative_rate": {
                            "average": 0.19843750000000002,
                            "stdev": 0.01788362990978061
                        },
                        "false_positive_rate": {
                            "average": 0.014062499999999999,
                            "stdev": 0.01788362990978062
                        },
                        "false_negative_rate": {
                            "average": 0.5921875000000001,
                            "stdev": 0.14384337999626537
                        },
                        "true_positive_rate": {
                            "average": 0.19531250000000003,
                            "stdev": 0.14384337999626537
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 143,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.79375,
                        "precision": 0.8251748251748252,
                        "recall": 0.9365079365079365,
                        "f1": 0.8773234200743495,
                        "true_negative_rate": 0.05625,
                        "false_positive_rate": 0.15625,
                        "false_negative_rate": 0.05,
                        "true_positive_rate": 0.7375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 142,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.7875,
                        "precision": 0.823943661971831,
                        "recall": 0.9285714285714286,
                        "f1": 0.8731343283582089,
                        "true_negative_rate": 0.05625,
                        "false_positive_rate": 0.15625,
                        "false_negative_rate": 0.05625,
                        "true_positive_rate": 0.73125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 70,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.475,
                        "precision": 0.8,
                        "recall": 0.4444444444444444,
                        "f1": 0.5714285714285714,
                        "true_negative_rate": 0.125,
                        "false_positive_rate": 0.0875,
                        "false_negative_rate": 0.4375,
                        "true_positive_rate": 0.35
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 21,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.29375,
                        "precision": 0.8095238095238095,
                        "recall": 0.1349206349206349,
                        "f1": 0.23129251700680273,
                        "true_negative_rate": 0.1875,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.68125,
                        "true_positive_rate": 0.10625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5875,
                            "stdev": 0.21300491484939965
                        },
                        "precision": {
                            "average": 0.8146605741676165,
                            "stdev": 0.010464750108738784
                        },
                        "recall": {
                            "average": 0.611111111111111,
                            "stdev": 0.339558277226689
                        },
                        "f1": {
                            "average": 0.6382947092169831,
                            "stdev": 0.26570953886676796
                        },
                        "true_negative_rate": {
                            "average": 0.10625,
                            "stdev": 0.05466517401417469
                        },
                        "false_positive_rate": {
                            "average": 0.10625000000000001,
                            "stdev": 0.05466517401417469
                        },
                        "false_negative_rate": {
                            "average": 0.30625,
                            "stdev": 0.2674021433160176
                        },
                        "true_positive_rate": {
                            "average": 0.48125,
                            "stdev": 0.2674021433160176
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 37,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.41875,
                        "precision": 0.9459459459459459,
                        "recall": 0.2777777777777778,
                        "f1": 0.4294478527607362,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.56875,
                        "true_positive_rate": 0.21875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 13,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.29375,
                        "precision": 1.0,
                        "recall": 0.10317460317460317,
                        "f1": 0.18705035971223022,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.70625,
                        "true_positive_rate": 0.08125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 8,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.2625,
                        "precision": 1.0,
                        "recall": 0.06349206349206349,
                        "f1": 0.11940298507462686,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7375,
                        "true_positive_rate": 0.05
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 0,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.2125,
                        "precision": 0.0,
                        "recall": 0.0,
                        "f1": 0.0,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7875,
                        "true_positive_rate": 0.0
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.296875,
                            "stdev": 0.07609872288678701
                        },
                        "precision": {
                            "average": 0.7364864864864865,
                            "stdev": 0.42578291227535536
                        },
                        "recall": {
                            "average": 0.1111111111111111,
                            "stdev": 0.10302186496624739
                        },
                        "f1": {
                            "average": 0.18397529938689833,
                            "stdev": 0.1567503788001944
                        },
                        "true_negative_rate": {
                            "average": 0.209375,
                            "stdev": 0.005412658773652735
                        },
                        "false_positive_rate": {
                            "average": 0.003125,
                            "stdev": 0.0054126587736527424
                        },
                        "false_negative_rate": {
                            "average": 0.7000000000000001,
                            "stdev": 0.08112971866091982
                        },
                        "true_positive_rate": {
                            "average": 0.0875,
                            "stdev": 0.08112971866091981
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 76,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.675,
                        "precision": 0.9868421052631579,
                        "recall": 0.5952380952380952,
                        "f1": 0.7425742574257426,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.31875,
                        "true_positive_rate": 0.46875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 25,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.35625,
                        "precision": 0.96,
                        "recall": 0.19047619047619047,
                        "f1": 0.31788079470198677,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.6375,
                        "true_positive_rate": 0.15
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 22,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.35,
                        "precision": 1.0,
                        "recall": 0.1746031746031746,
                        "f1": 0.2972972972972973,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.65,
                        "true_positive_rate": 0.1375
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 1,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.21875,
                        "precision": 1.0,
                        "recall": 0.007936507936507936,
                        "f1": 0.015748031496062992,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.78125,
                        "true_positive_rate": 0.00625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4,
                            "stdev": 0.16799600069644516
                        },
                        "precision": {
                            "average": 0.9867105263157895,
                            "stdev": 0.016330108317862763
                        },
                        "recall": {
                            "average": 0.24206349206349204,
                            "stdev": 0.21607862523907906
                        },
                        "f1": {
                            "average": 0.34337509523027243,
                            "stdev": 0.2595537453064604
                        },
                        "true_negative_rate": {
                            "average": 0.209375,
                            "stdev": 0.0031250000000000028
                        },
                        "false_positive_rate": {
                            "average": 0.003125,
                            "stdev": 0.003125
                        },
                        "false_negative_rate": {
                            "average": 0.596875,
                            "stdev": 0.1701619173757748
                        },
                        "true_positive_rate": {
                            "average": 0.19062500000000002,
                            "stdev": 0.17016191737577477
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 94,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.575,
                        "precision": 0.8085106382978723,
                        "recall": 0.6031746031746031,
                        "f1": 0.6909090909090909,
                        "true_negative_rate": 0.1,
                        "false_positive_rate": 0.1125,
                        "false_negative_rate": 0.3125,
                        "true_positive_rate": 0.475
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 19,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.29375,
                        "precision": 0.8421052631578947,
                        "recall": 0.12698412698412698,
                        "f1": 0.2206896551724138,
                        "true_negative_rate": 0.19375,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.6875,
                        "true_positive_rate": 0.1
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 70,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.8285714285714286,
                        "recall": 0.4603174603174603,
                        "f1": 0.5918367346938775,
                        "true_negative_rate": 0.1375,
                        "false_positive_rate": 0.075,
                        "false_negative_rate": 0.425,
                        "true_positive_rate": 0.3625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 11,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.26875,
                        "precision": 0.9090909090909091,
                        "recall": 0.07936507936507936,
                        "f1": 0.145985401459854,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.725,
                        "true_positive_rate": 0.0625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.409375,
                            "stdev": 0.13113834536473304
                        },
                        "precision": {
                            "average": 0.8470695597795262,
                            "stdev": 0.03775004402438644
                        },
                        "recall": {
                            "average": 0.31746031746031744,
                            "stdev": 0.2208004403689453
                        },
                        "f1": {
                            "average": 0.4123552205588091,
                            "stdev": 0.23318149695303603
                        },
                        "true_negative_rate": {
                            "average": 0.159375,
                            "stdev": 0.042961647140210994
                        },
                        "false_positive_rate": {
                            "average": 0.053125,
                            "stdev": 0.042961647140211
                        },
                        "false_negative_rate": {
                            "average": 0.5375,
                            "stdev": 0.17388034679054443
                        },
                        "true_positive_rate": {
                            "average": 0.25,
                            "stdev": 0.17388034679054443
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 57,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.55625,
                        "precision": 0.9824561403508771,
                        "recall": 0.4444444444444444,
                        "f1": 0.6120218579234973,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.4375,
                        "true_positive_rate": 0.35
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 21,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.34375,
                        "precision": 1.0,
                        "recall": 0.16666666666666666,
                        "f1": 0.2857142857142857,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.65625,
                        "true_positive_rate": 0.13125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 77,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.68125,
                        "precision": 0.987012987012987,
                        "recall": 0.6031746031746031,
                        "f1": 0.7487684729064039,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.3125,
                        "true_positive_rate": 0.475
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 11,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.28125,
                        "precision": 1.0,
                        "recall": 0.0873015873015873,
                        "f1": 0.16058394160583941,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.71875,
                        "true_positive_rate": 0.06875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.465625,
                            "stdev": 0.16089957077320002
                        },
                        "precision": {
                            "average": 0.992367281840966,
                            "stdev": 0.007800896931314664
                        },
                        "recall": {
                            "average": 0.32539682539682535,
                            "stdev": 0.20809699368455387
                        },
                        "f1": {
                            "average": 0.4517721395375066,
                            "stdev": 0.23783007153660388
                        },
                        "true_negative_rate": {
                            "average": 0.209375,
                            "stdev": 0.0031250000000000028
                        },
                        "false_positive_rate": {
                            "average": 0.003125,
                            "stdev": 0.003125
                        },
                        "false_negative_rate": {
                            "average": 0.53125,
                            "stdev": 0.16387638252658618
                        },
                        "true_positive_rate": {
                            "average": 0.25625,
                            "stdev": 0.16387638252658618
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 44,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.4125,
                        "precision": 0.8636363636363636,
                        "recall": 0.30158730158730157,
                        "f1": 0.4470588235294118,
                        "true_negative_rate": 0.175,
                        "false_positive_rate": 0.0375,
                        "false_negative_rate": 0.55,
                        "true_positive_rate": 0.2375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 50,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.4625,
                        "precision": 0.9,
                        "recall": 0.35714285714285715,
                        "f1": 0.5113636363636364,
                        "true_negative_rate": 0.18125,
                        "false_positive_rate": 0.03125,
                        "false_negative_rate": 0.50625,
                        "true_positive_rate": 0.28125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 36,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.375,
                        "precision": 0.8611111111111112,
                        "recall": 0.24603174603174602,
                        "f1": 0.38271604938271603,
                        "true_negative_rate": 0.18125,
                        "false_positive_rate": 0.03125,
                        "false_negative_rate": 0.59375,
                        "true_positive_rate": 0.19375
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 17,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.26875,
                        "precision": 0.7647058823529411,
                        "recall": 0.10317460317460317,
                        "f1": 0.18181818181818182,
                        "true_negative_rate": 0.1875,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.70625,
                        "true_positive_rate": 0.08125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3796875,
                            "stdev": 0.07117526233706485
                        },
                        "precision": {
                            "average": 0.847363339275104,
                            "stdev": 0.05014151421084151
                        },
                        "recall": {
                            "average": 0.251984126984127,
                            "stdev": 0.09447028458029692
                        },
                        "f1": {
                            "average": 0.3807391727734865,
                            "stdev": 0.12352582245058198
                        },
                        "true_negative_rate": {
                            "average": 0.18125,
                            "stdev": 0.004419417382415926
                        },
                        "false_positive_rate": {
                            "average": 0.03125,
                            "stdev": 0.004419417382415921
                        },
                        "false_negative_rate": {
                            "average": 0.5890625,
                            "stdev": 0.07439534910698385
                        },
                        "true_positive_rate": {
                            "average": 0.19843750000000002,
                            "stdev": 0.07439534910698384
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 79,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.63125,
                        "precision": 0.9240506329113924,
                        "recall": 0.5793650793650794,
                        "f1": 0.7121951219512195,
                        "true_negative_rate": 0.175,
                        "false_positive_rate": 0.0375,
                        "false_negative_rate": 0.33125,
                        "true_positive_rate": 0.45625
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 88,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.6875,
                        "precision": 0.9318181818181818,
                        "recall": 0.6507936507936508,
                        "f1": 0.7663551401869159,
                        "true_negative_rate": 0.175,
                        "false_positive_rate": 0.0375,
                        "false_negative_rate": 0.275,
                        "true_positive_rate": 0.5125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 69,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.59375,
                        "precision": 0.9420289855072463,
                        "recall": 0.5158730158730159,
                        "f1": 0.6666666666666666,
                        "true_negative_rate": 0.1875,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.38125,
                        "true_positive_rate": 0.40625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 58,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.896551724137931,
                        "recall": 0.4126984126984127,
                        "f1": 0.5652173913043478,
                        "true_negative_rate": 0.175,
                        "false_positive_rate": 0.0375,
                        "false_negative_rate": 0.4625,
                        "true_positive_rate": 0.325
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.603125,
                            "stdev": 0.06825103021200486
                        },
                        "precision": {
                            "average": 0.9236123810936879,
                            "stdev": 0.01687437186377968
                        },
                        "recall": {
                            "average": 0.5396825396825397,
                            "stdev": 0.08748177652797066
                        },
                        "f1": {
                            "average": 0.6776085800272874,
                            "stdev": 0.07386420918308342
                        },
                        "true_negative_rate": {
                            "average": 0.17812499999999998,
                            "stdev": 0.005412658773652747
                        },
                        "false_positive_rate": {
                            "average": 0.034375,
                            "stdev": 0.00541265877365274
                        },
                        "false_negative_rate": {
                            "average": 0.3625,
                            "stdev": 0.06889189901577689
                        },
                        "true_positive_rate": {
                            "average": 0.425,
                            "stdev": 0.06889189901577686
                        }
                    }
                }
            }
        }
    },
    "answerability_classification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 94,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.6063829787234043,
                        "recall": 0.7037037037037037,
                        "f1": 0.6514285714285715,
                        "true_negative_rate": 0.15714285714285714,
                        "false_positive_rate": 0.2642857142857143,
                        "false_negative_rate": 0.17142857142857143,
                        "true_positive_rate": 0.40714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 90,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.5888888888888889,
                        "recall": 0.654320987654321,
                        "f1": 0.6198830409356725,
                        "true_negative_rate": 0.15714285714285714,
                        "false_positive_rate": 0.2642857142857143,
                        "false_negative_rate": 0.2,
                        "true_positive_rate": 0.37857142857142856
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 92,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5214285714285715,
                        "precision": 0.5760869565217391,
                        "recall": 0.654320987654321,
                        "f1": 0.6127167630057804,
                        "true_negative_rate": 0.14285714285714285,
                        "false_positive_rate": 0.2785714285714286,
                        "false_negative_rate": 0.2,
                        "true_positive_rate": 0.37857142857142856
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 44,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.6136363636363636,
                        "recall": 0.3333333333333333,
                        "f1": 0.432,
                        "true_negative_rate": 0.3,
                        "false_positive_rate": 0.12142857142857143,
                        "false_negative_rate": 0.38571428571428573,
                        "true_positive_rate": 0.19285714285714287
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.525,
                            "stdev": 0.021128856368212916
                        },
                        "precision": {
                            "average": 0.596248796942599,
                            "stdev": 0.014711366112735802
                        },
                        "recall": {
                            "average": 0.5864197530864198,
                            "stdev": 0.14750374253516693
                        },
                        "f1": {
                            "average": 0.5790070938425061,
                            "stdev": 0.08611495355283702
                        },
                        "true_negative_rate": {
                            "average": 0.18928571428571428,
                            "stdev": 0.06418643127004081
                        },
                        "false_positive_rate": {
                            "average": 0.23214285714285715,
                            "stdev": 0.06418643127004083
                        },
                        "false_negative_rate": {
                            "average": 0.23928571428571427,
                            "stdev": 0.08534145103820372
                        },
                        "true_positive_rate": {
                            "average": 0.33928571428571425,
                            "stdev": 0.08534145103820372
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 137,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5857142857142857,
                        "precision": 0.583941605839416,
                        "recall": 0.9876543209876543,
                        "f1": 0.7339449541284404,
                        "true_negative_rate": 0.014285714285714285,
                        "false_positive_rate": 0.40714285714285714,
                        "false_negative_rate": 0.007142857142857143,
                        "true_positive_rate": 0.5714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 94,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.5851063829787234,
                        "recall": 0.6790123456790124,
                        "f1": 0.6285714285714286,
                        "true_negative_rate": 0.14285714285714285,
                        "false_positive_rate": 0.2785714285714286,
                        "false_negative_rate": 0.18571428571428572,
                        "true_positive_rate": 0.39285714285714285
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 135,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.6,
                        "precision": 0.5925925925925926,
                        "recall": 0.9876543209876543,
                        "f1": 0.7407407407407407,
                        "true_negative_rate": 0.02857142857142857,
                        "false_positive_rate": 0.39285714285714285,
                        "false_negative_rate": 0.007142857142857143,
                        "true_positive_rate": 0.5714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 81,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4785714285714286,
                        "precision": 0.5555555555555556,
                        "recall": 0.5555555555555556,
                        "f1": 0.5555555555555556,
                        "true_negative_rate": 0.16428571428571428,
                        "false_positive_rate": 0.2571428571428571,
                        "false_negative_rate": 0.2571428571428571,
                        "true_positive_rate": 0.32142857142857145
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5410714285714286,
                            "stdev": 0.05258011380119806
                        },
                        "precision": {
                            "average": 0.5792990342415718,
                            "stdev": 0.01410452271561596
                        },
                        "recall": {
                            "average": 0.8024691358024691,
                            "stdev": 0.19025969144965973
                        },
                        "f1": {
                            "average": 0.6647031697490413,
                            "stdev": 0.07712788294614491
                        },
                        "true_negative_rate": {
                            "average": 0.0875,
                            "stdev": 0.06669589070351106
                        },
                        "false_positive_rate": {
                            "average": 0.3339285714285714,
                            "stdev": 0.06669589070351106
                        },
                        "false_negative_rate": {
                            "average": 0.11428571428571428,
                            "stdev": 0.11007882148158886
                        },
                        "true_positive_rate": {
                            "average": 0.46428571428571425,
                            "stdev": 0.11007882148158885
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 133,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.5714285714285714,
                        "recall": 0.9382716049382716,
                        "f1": 0.7102803738317757,
                        "true_negative_rate": 0.014285714285714285,
                        "false_positive_rate": 0.40714285714285714,
                        "false_negative_rate": 0.03571428571428571,
                        "true_positive_rate": 0.5428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 120,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.5666666666666667,
                        "recall": 0.8395061728395061,
                        "f1": 0.6766169154228856,
                        "true_negative_rate": 0.05,
                        "false_positive_rate": 0.37142857142857144,
                        "false_negative_rate": 0.09285714285714286,
                        "true_positive_rate": 0.4857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 128,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5642857142857143,
                        "precision": 0.578125,
                        "recall": 0.9135802469135802,
                        "f1": 0.7081339712918661,
                        "true_negative_rate": 0.03571428571428571,
                        "false_positive_rate": 0.38571428571428573,
                        "false_negative_rate": 0.05,
                        "true_positive_rate": 0.5285714285714286
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 56,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.6607142857142857,
                        "recall": 0.4567901234567901,
                        "f1": 0.5401459854014599,
                        "true_negative_rate": 0.2857142857142857,
                        "false_positive_rate": 0.1357142857142857,
                        "false_negative_rate": 0.3142857142857143,
                        "true_positive_rate": 0.2642857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5517857142857143,
                            "stdev": 0.010564428184106462
                        },
                        "precision": {
                            "average": 0.5942336309523809,
                            "stdev": 0.03859784226011189
                        },
                        "recall": {
                            "average": 0.787037037037037,
                            "stdev": 0.1941012059695077
                        },
                        "f1": {
                            "average": 0.6587943114869969,
                            "stdev": 0.06978589961336415
                        },
                        "true_negative_rate": {
                            "average": 0.09642857142857142,
                            "stdev": 0.11002087000535259
                        },
                        "false_positive_rate": {
                            "average": 0.325,
                            "stdev": 0.1100208700053526
                        },
                        "false_negative_rate": {
                            "average": 0.1232142857142857,
                            "stdev": 0.11230141202521517
                        },
                        "true_positive_rate": {
                            "average": 0.4553571428571428,
                            "stdev": 0.11230141202521517
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 58,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4642857142857143,
                        "precision": 0.5689655172413793,
                        "recall": 0.4074074074074074,
                        "f1": 0.4748201438848921,
                        "true_negative_rate": 0.24285714285714285,
                        "false_positive_rate": 0.17857142857142858,
                        "false_negative_rate": 0.34285714285714286,
                        "true_positive_rate": 0.2357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 55,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.6,
                        "recall": 0.4074074074074074,
                        "f1": 0.4852941176470588,
                        "true_negative_rate": 0.2642857142857143,
                        "false_positive_rate": 0.15714285714285714,
                        "false_negative_rate": 0.34285714285714286,
                        "true_positive_rate": 0.2357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 38,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.45,
                        "precision": 0.5789473684210527,
                        "recall": 0.2716049382716049,
                        "f1": 0.3697478991596639,
                        "true_negative_rate": 0.30714285714285716,
                        "false_positive_rate": 0.11428571428571428,
                        "false_negative_rate": 0.42142857142857143,
                        "true_positive_rate": 0.15714285714285714
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 18,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4357142857142857,
                        "precision": 0.6666666666666666,
                        "recall": 0.14814814814814814,
                        "f1": 0.24242424242424243,
                        "true_negative_rate": 0.37857142857142856,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.4928571428571429,
                        "true_positive_rate": 0.08571428571428572
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.46071428571428574,
                            "stdev": 0.02112885636821292
                        },
                        "precision": {
                            "average": 0.6036448880822747,
                            "stdev": 0.03807117027492464
                        },
                        "recall": {
                            "average": 0.30864197530864196,
                            "stdev": 0.10798059064528333
                        },
                        "f1": {
                            "average": 0.3930716007789643,
                            "stdev": 0.09801331357768425
                        },
                        "true_negative_rate": {
                            "average": 0.2982142857142857,
                            "stdev": 0.051847254074849906
                        },
                        "false_positive_rate": {
                            "average": 0.12321428571428572,
                            "stdev": 0.05184725407484991
                        },
                        "false_negative_rate": {
                            "average": 0.4,
                            "stdev": 0.06247448458762823
                        },
                        "true_positive_rate": {
                            "average": 0.17857142857142858,
                            "stdev": 0.06247448458762822
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 126,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.5714285714285714,
                        "recall": 0.8888888888888888,
                        "f1": 0.6956521739130435,
                        "true_negative_rate": 0.03571428571428571,
                        "false_positive_rate": 0.38571428571428573,
                        "false_negative_rate": 0.06428571428571428,
                        "true_positive_rate": 0.5142857142857142
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 104,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5642857142857143,
                        "precision": 0.5961538461538461,
                        "recall": 0.7654320987654321,
                        "f1": 0.6702702702702703,
                        "true_negative_rate": 0.12142857142857143,
                        "false_positive_rate": 0.3,
                        "false_negative_rate": 0.1357142857142857,
                        "true_positive_rate": 0.44285714285714284
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 45,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5285714285714286,
                        "precision": 0.6666666666666666,
                        "recall": 0.37037037037037035,
                        "f1": 0.47619047619047616,
                        "true_negative_rate": 0.3142857142857143,
                        "false_positive_rate": 0.10714285714285714,
                        "false_negative_rate": 0.36428571428571427,
                        "true_positive_rate": 0.21428571428571427
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 17,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.7647058823529411,
                        "recall": 0.16049382716049382,
                        "f1": 0.2653061224489796,
                        "true_negative_rate": 0.39285714285714285,
                        "false_positive_rate": 0.02857142857142857,
                        "false_negative_rate": 0.4857142857142857,
                        "true_positive_rate": 0.09285714285714286
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5321428571428571,
                            "stdev": 0.02966651379613599
                        },
                        "precision": {
                            "average": 0.6497387416505063,
                            "stdev": 0.07501303532090707
                        },
                        "recall": {
                            "average": 0.5462962962962962,
                            "stdev": 0.2937616640353547
                        },
                        "f1": {
                            "average": 0.5268547607056924,
                            "stdev": 0.17323043539498215
                        },
                        "true_negative_rate": {
                            "average": 0.21607142857142858,
                            "stdev": 0.14351411435955755
                        },
                        "false_positive_rate": {
                            "average": 0.20535714285714285,
                            "stdev": 0.14351411435955755
                        },
                        "false_negative_rate": {
                            "average": 0.2625,
                            "stdev": 0.16996210562045522
                        },
                        "true_positive_rate": {
                            "average": 0.31607142857142856,
                            "stdev": 0.1699621056204552
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 20,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4785714285714286,
                        "precision": 0.7,
                        "recall": 0.1728395061728395,
                        "f1": 0.27722772277227725,
                        "true_negative_rate": 0.37857142857142856,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.4785714285714286,
                        "true_positive_rate": 0.1
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 7,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.44285714285714284,
                        "precision": 0.7142857142857143,
                        "recall": 0.06172839506172839,
                        "f1": 0.11363636363636363,
                        "true_negative_rate": 0.40714285714285714,
                        "false_positive_rate": 0.014285714285714285,
                        "false_negative_rate": 0.5428571428571428,
                        "true_positive_rate": 0.03571428571428571
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 16,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.8125,
                        "recall": 0.16049382716049382,
                        "f1": 0.26804123711340205,
                        "true_negative_rate": 0.4,
                        "false_positive_rate": 0.02142857142857143,
                        "false_negative_rate": 0.4857142857142857,
                        "true_positive_rate": 0.09285714285714286
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 0,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.42142857142857143,
                        "precision": 0.0,
                        "recall": 0.0,
                        "f1": 0.0,
                        "true_negative_rate": 0.42142857142857143,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.5785714285714286,
                        "true_positive_rate": 0.0
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.45892857142857146,
                            "stdev": 0.028291034853133692
                        },
                        "precision": {
                            "average": 0.5566964285714285,
                            "stdev": 0.3243133813520182
                        },
                        "recall": {
                            "average": 0.09876543209876543,
                            "stdev": 0.07145578335055694
                        },
                        "f1": {
                            "average": 0.16472633088051075,
                            "stdev": 0.11519058323693335
                        },
                        "true_negative_rate": {
                            "average": 0.4017857142857143,
                            "stdev": 0.015464739353293552
                        },
                        "false_positive_rate": {
                            "average": 0.019642857142857142,
                            "stdev": 0.015464739353293547
                        },
                        "false_negative_rate": {
                            "average": 0.5214285714285715,
                            "stdev": 0.041342274652822246
                        },
                        "true_positive_rate": {
                            "average": 0.05714285714285715,
                            "stdev": 0.04134227465282223
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 41,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.5853658536585366,
                        "recall": 0.2962962962962963,
                        "f1": 0.39344262295081966,
                        "true_negative_rate": 0.3,
                        "false_positive_rate": 0.12142857142857143,
                        "false_negative_rate": 0.40714285714285714,
                        "true_positive_rate": 0.17142857142857143
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 36,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.6388888888888888,
                        "recall": 0.2839506172839506,
                        "f1": 0.39316239316239315,
                        "true_negative_rate": 0.32857142857142857,
                        "false_positive_rate": 0.09285714285714286,
                        "false_negative_rate": 0.4142857142857143,
                        "true_positive_rate": 0.16428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 17,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.7647058823529411,
                        "recall": 0.16049382716049382,
                        "f1": 0.2653061224489796,
                        "true_negative_rate": 0.39285714285714285,
                        "false_positive_rate": 0.02857142857142857,
                        "false_negative_rate": 0.4857142857142857,
                        "true_positive_rate": 0.09285714285714286
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 2,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.42142857142857143,
                        "precision": 0.5,
                        "recall": 0.012345679012345678,
                        "f1": 0.024096385542168676,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5714285714285714,
                        "true_positive_rate": 0.007142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.46785714285714286,
                            "stdev": 0.02789374884252377
                        },
                        "precision": {
                            "average": 0.6222401562250917,
                            "stdev": 0.09601566721654163
                        },
                        "recall": {
                            "average": 0.1882716049382716,
                            "stdev": 0.11461385572609346
                        },
                        "f1": {
                            "average": 0.2690018810260903,
                            "stdev": 0.15074290312848193
                        },
                        "true_negative_rate": {
                            "average": 0.35892857142857143,
                            "stdev": 0.04639421805988065
                        },
                        "false_positive_rate": {
                            "average": 0.0625,
                            "stdev": 0.04639421805988064
                        },
                        "false_negative_rate": {
                            "average": 0.46964285714285714,
                            "stdev": 0.06631230224152548
                        },
                        "true_positive_rate": {
                            "average": 0.10892857142857143,
                            "stdev": 0.0663123022415255
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 66,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.6214285714285714,
                        "precision": 0.7121212121212122,
                        "recall": 0.5802469135802469,
                        "f1": 0.6394557823129252,
                        "true_negative_rate": 0.2857142857142857,
                        "false_positive_rate": 0.1357142857142857,
                        "false_negative_rate": 0.24285714285714285,
                        "true_positive_rate": 0.3357142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 13,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.7692307692307693,
                        "recall": 0.12345679012345678,
                        "f1": 0.2127659574468085,
                        "true_negative_rate": 0.4,
                        "false_positive_rate": 0.02142857142857143,
                        "false_negative_rate": 0.5071428571428571,
                        "true_positive_rate": 0.07142857142857142
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 57,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5428571428571428,
                        "precision": 0.6491228070175439,
                        "recall": 0.4567901234567901,
                        "f1": 0.5362318840579711,
                        "true_negative_rate": 0.2785714285714286,
                        "false_positive_rate": 0.14285714285714285,
                        "false_negative_rate": 0.3142857142857143,
                        "true_positive_rate": 0.2642857142857143
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 11,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.8181818181818182,
                        "recall": 0.1111111111111111,
                        "f1": 0.1956521739130435,
                        "true_negative_rate": 0.40714285714285714,
                        "false_positive_rate": 0.014285714285714285,
                        "false_negative_rate": 0.5142857142857142,
                        "true_positive_rate": 0.06428571428571428
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5267857142857143,
                            "stdev": 0.061936232842945096
                        },
                        "precision": {
                            "average": 0.737164151637836,
                            "stdev": 0.06318733543651298
                        },
                        "recall": {
                            "average": 0.31790123456790126,
                            "stdev": 0.2053571132733305
                        },
                        "f1": {
                            "average": 0.39602644943268706,
                            "stdev": 0.19535202981919006
                        },
                        "true_negative_rate": {
                            "average": 0.34285714285714286,
                            "stdev": 0.06081923702116572
                        },
                        "false_positive_rate": {
                            "average": 0.07857142857142857,
                            "stdev": 0.06081923702116571
                        },
                        "false_negative_rate": {
                            "average": 0.39464285714285713,
                            "stdev": 0.11881375839385548
                        },
                        "true_positive_rate": {
                            "average": 0.18392857142857144,
                            "stdev": 0.1188137583938555
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 40,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5071428571428571,
                        "precision": 0.65,
                        "recall": 0.32098765432098764,
                        "f1": 0.4297520661157025,
                        "true_negative_rate": 0.32142857142857145,
                        "false_positive_rate": 0.1,
                        "false_negative_rate": 0.39285714285714285,
                        "true_positive_rate": 0.18571428571428572
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 24,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5071428571428571,
                        "precision": 0.75,
                        "recall": 0.2222222222222222,
                        "f1": 0.34285714285714286,
                        "true_negative_rate": 0.37857142857142856,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.45,
                        "true_positive_rate": 0.12857142857142856
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 40,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.7,
                        "recall": 0.345679012345679,
                        "f1": 0.4628099173553719,
                        "true_negative_rate": 0.3357142857142857,
                        "false_positive_rate": 0.08571428571428572,
                        "false_negative_rate": 0.37857142857142856,
                        "true_positive_rate": 0.2
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 21,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.7142857142857143,
                        "recall": 0.18518518518518517,
                        "f1": 0.29411764705882354,
                        "true_negative_rate": 0.37857142857142856,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.4714285714285714,
                        "true_positive_rate": 0.10714285714285714
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5089285714285714,
                            "stdev": 0.01776763280547535
                        },
                        "precision": {
                            "average": 0.7035714285714285,
                            "stdev": 0.0358924129325746
                        },
                        "recall": {
                            "average": 0.2685185185185185,
                            "stdev": 0.0666980950153528
                        },
                        "f1": {
                            "average": 0.3823841933467602,
                            "stdev": 0.06720376359805548
                        },
                        "true_negative_rate": {
                            "average": 0.35357142857142854,
                            "stdev": 0.02550510153051017
                        },
                        "false_positive_rate": {
                            "average": 0.06785714285714285,
                            "stdev": 0.02550510153051018
                        },
                        "false_negative_rate": {
                            "average": 0.42321428571428577,
                            "stdev": 0.03858961211602556
                        },
                        "true_positive_rate": {
                            "average": 0.15535714285714283,
                            "stdev": 0.03858961211602556
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 13,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.9230769230769231,
                        "recall": 0.14814814814814814,
                        "f1": 0.2553191489361702,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.4928571428571429,
                        "true_positive_rate": 0.08571428571428572
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 13,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.9230769230769231,
                        "recall": 0.14814814814814814,
                        "f1": 0.2553191489361702,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.4928571428571429,
                        "true_positive_rate": 0.08571428571428572
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 10,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4785714285714286,
                        "precision": 0.9,
                        "recall": 0.1111111111111111,
                        "f1": 0.1978021978021978,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5142857142857142,
                        "true_positive_rate": 0.06428571428571428
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 8,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4642857142857143,
                        "precision": 0.875,
                        "recall": 0.08641975308641975,
                        "f1": 0.15730337078651685,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5285714285714286,
                        "true_positive_rate": 0.05
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.48571428571428577,
                            "stdev": 0.015152288168283153
                        },
                        "precision": {
                            "average": 0.9052884615384615,
                            "stdev": 0.01986339255780156
                        },
                        "recall": {
                            "average": 0.12345679012345678,
                            "stdev": 0.0261891400439462
                        },
                        "f1": {
                            "average": 0.21643596661526376,
                            "stdev": 0.04143574834737038
                        },
                        "true_negative_rate": {
                            "average": 0.4142857142857143,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.007142857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5071428571428571,
                            "stdev": 0.015152288168283146
                        },
                        "true_positive_rate": {
                            "average": 0.07142857142857142,
                            "stdev": 0.015152288168283162
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 140,
                    "prediction_error_num": 13,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.9230769230769231,
                        "recall": 0.14814814814814814,
                        "f1": 0.2553191489361702,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.4928571428571429,
                        "true_positive_rate": 0.08571428571428572
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 140,
                    "prediction_error_num": 12,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.9166666666666666,
                        "recall": 0.13580246913580246,
                        "f1": 0.23655913978494625,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5,
                        "true_positive_rate": 0.07857142857142857
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 140,
                    "prediction_error_num": 11,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.9090909090909091,
                        "recall": 0.12345679012345678,
                        "f1": 0.21739130434782608,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5071428571428571,
                        "true_positive_rate": 0.07142857142857142
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 140,
                    "prediction_error_num": 12,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4928571428571429,
                        "precision": 0.9166666666666666,
                        "recall": 0.13580246913580246,
                        "f1": 0.23655913978494625,
                        "true_negative_rate": 0.4142857142857143,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5,
                        "true_positive_rate": 0.07857142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4928571428571429,
                            "stdev": 0.005050762722761055
                        },
                        "precision": {
                            "average": 0.9163752913752914,
                            "stdev": 0.004953379953379981
                        },
                        "recall": {
                            "average": 0.13580246913580246,
                            "stdev": 0.008729713347982067
                        },
                        "f1": {
                            "average": 0.2364571832134722,
                            "stdev": 0.01340990564963694
                        },
                        "true_negative_rate": {
                            "average": 0.4142857142857143,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.007142857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5,
                            "stdev": 0.005050762722761035
                        },
                        "true_positive_rate": {
                            "average": 0.07857142857142857,
                            "stdev": 0.005050762722761055
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 49,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.4,
                        "precision": 0.7959183673469388,
                        "recall": 0.31451612903225806,
                        "f1": 0.4508670520231214,
                        "true_negative_rate": 0.1625,
                        "false_positive_rate": 0.0625,
                        "false_negative_rate": 0.53125,
                        "true_positive_rate": 0.24375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 46,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.375,
                        "precision": 0.7608695652173914,
                        "recall": 0.28225806451612906,
                        "f1": 0.4117647058823529,
                        "true_negative_rate": 0.15625,
                        "false_positive_rate": 0.06875,
                        "false_negative_rate": 0.55625,
                        "true_positive_rate": 0.21875
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 47,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.40625,
                        "precision": 0.8085106382978723,
                        "recall": 0.3064516129032258,
                        "f1": 0.4444444444444444,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.05625,
                        "false_negative_rate": 0.5375,
                        "true_positive_rate": 0.2375
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 11,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.25625,
                        "precision": 0.7272727272727273,
                        "recall": 0.06451612903225806,
                        "f1": 0.11851851851851852,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.725,
                        "true_positive_rate": 0.05
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.359375,
                            "stdev": 0.06067652449671126
                        },
                        "precision": {
                            "average": 0.7731428245337324,
                            "stdev": 0.031718710242817645
                        },
                        "recall": {
                            "average": 0.24193548387096775,
                            "stdev": 0.10311864397213244
                        },
                        "f1": {
                            "average": 0.3563986802171093,
                            "stdev": 0.13813824415939865
                        },
                        "true_negative_rate": {
                            "average": 0.1734375,
                            "stdev": 0.019452968121857386
                        },
                        "false_positive_rate": {
                            "average": 0.0515625,
                            "stdev": 0.019452968121857393
                        },
                        "false_negative_rate": {
                            "average": 0.5875,
                            "stdev": 0.07991694907840263
                        },
                        "true_positive_rate": {
                            "average": 0.1875,
                            "stdev": 0.07991694907840263
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 154,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.775,
                        "precision": 0.7857142857142857,
                        "recall": 0.9758064516129032,
                        "f1": 0.8705035971223022,
                        "true_negative_rate": 0.01875,
                        "false_positive_rate": 0.20625,
                        "false_negative_rate": 0.01875,
                        "true_positive_rate": 0.75625
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 63,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.45,
                        "precision": 0.7936507936507936,
                        "recall": 0.4032258064516129,
                        "f1": 0.5347593582887701,
                        "true_negative_rate": 0.14375,
                        "false_positive_rate": 0.08125,
                        "false_negative_rate": 0.4625,
                        "true_positive_rate": 0.3125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 153,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.74375,
                        "precision": 0.7712418300653595,
                        "recall": 0.9516129032258065,
                        "f1": 0.851985559566787,
                        "true_negative_rate": 0.00625,
                        "false_positive_rate": 0.21875,
                        "false_negative_rate": 0.0375,
                        "true_positive_rate": 0.7375
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 123,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.69375,
                        "precision": 0.8048780487804879,
                        "recall": 0.7983870967741935,
                        "f1": 0.8016194331983806,
                        "true_negative_rate": 0.075,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.15625,
                        "true_positive_rate": 0.61875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.665625,
                            "stdev": 0.12781975835135975
                        },
                        "precision": {
                            "average": 0.7888712395527318,
                            "stdev": 0.012245657551164897
                        },
                        "recall": {
                            "average": 0.782258064516129,
                            "stdev": 0.22916568126423909
                        },
                        "f1": {
                            "average": 0.76471698704406,
                            "stdev": 0.13513781130472774
                        },
                        "true_negative_rate": {
                            "average": 0.06093749999999999,
                            "stdev": 0.05437410200407911
                        },
                        "false_positive_rate": {
                            "average": 0.1640625,
                            "stdev": 0.05437410200407911
                        },
                        "false_negative_rate": {
                            "average": 0.16875,
                            "stdev": 0.17760340297978527
                        },
                        "true_positive_rate": {
                            "average": 0.6062500000000001,
                            "stdev": 0.17760340297978527
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 126,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.7125,
                        "precision": 0.8095238095238095,
                        "recall": 0.8225806451612904,
                        "f1": 0.816,
                        "true_negative_rate": 0.075,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.1375,
                        "true_positive_rate": 0.6375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 34,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.35,
                        "precision": 0.7941176470588235,
                        "recall": 0.21774193548387097,
                        "f1": 0.34177215189873417,
                        "true_negative_rate": 0.18125,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.60625,
                        "true_positive_rate": 0.16875
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 118,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.725,
                        "precision": 0.8389830508474576,
                        "recall": 0.7983870967741935,
                        "f1": 0.8181818181818182,
                        "true_negative_rate": 0.10625,
                        "false_positive_rate": 0.11875,
                        "false_negative_rate": 0.15625,
                        "true_positive_rate": 0.61875
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 4,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.225,
                        "precision": 0.5,
                        "recall": 0.016129032258064516,
                        "f1": 0.03125,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.7625,
                        "true_positive_rate": 0.0125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.503125,
                            "stdev": 0.22015175873701306
                        },
                        "precision": {
                            "average": 0.7356561268575227,
                            "stdev": 0.13700770711299873
                        },
                        "recall": {
                            "average": 0.4637096774193548,
                            "stdev": 0.354127770200914
                        },
                        "f1": {
                            "average": 0.5018009925201381,
                            "stdev": 0.3338582444466568
                        },
                        "true_negative_rate": {
                            "average": 0.14375,
                            "stdev": 0.05537514108334172
                        },
                        "false_positive_rate": {
                            "average": 0.08125,
                            "stdev": 0.05537514108334172
                        },
                        "false_negative_rate": {
                            "average": 0.41562499999999997,
                            "stdev": 0.27444902190570836
                        },
                        "true_positive_rate": {
                            "average": 0.35937499999999994,
                            "stdev": 0.27444902190570836
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 43,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.36875,
                        "precision": 0.7906976744186046,
                        "recall": 0.27419354838709675,
                        "f1": 0.40718562874251496,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.05625,
                        "false_negative_rate": 0.5625,
                        "true_positive_rate": 0.2125
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 41,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.34375,
                        "precision": 0.7560975609756098,
                        "recall": 0.25,
                        "f1": 0.37575757575757573,
                        "true_negative_rate": 0.1625,
                        "false_positive_rate": 0.0625,
                        "false_negative_rate": 0.58125,
                        "true_positive_rate": 0.19375
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 44,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.36875,
                        "precision": 0.7954545454545454,
                        "recall": 0.28225806451612906,
                        "f1": 0.4166666666666667,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.05625,
                        "false_negative_rate": 0.55625,
                        "true_positive_rate": 0.21875
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 2,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.20625,
                        "precision": 1.0,
                        "recall": 0.016129032258064516,
                        "f1": 0.031746031746031744,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7625,
                        "true_positive_rate": 0.0125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.321875,
                            "stdev": 0.06753182120304473
                        },
                        "precision": {
                            "average": 0.8355624452121899,
                            "stdev": 0.09614555181163782
                        },
                        "recall": {
                            "average": 0.20564516129032256,
                            "stdev": 0.11005922632222727
                        },
                        "f1": {
                            "average": 0.3078389757281973,
                            "stdev": 0.1601198564566838
                        },
                        "true_negative_rate": {
                            "average": 0.18125,
                            "stdev": 0.025387620014487376
                        },
                        "false_positive_rate": {
                            "average": 0.04375,
                            "stdev": 0.025387620014487376
                        },
                        "false_negative_rate": {
                            "average": 0.6156250000000001,
                            "stdev": 0.08529590039972611
                        },
                        "true_positive_rate": {
                            "average": 0.159375,
                            "stdev": 0.08529590039972612
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 150,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.7625,
                        "precision": 0.7866666666666666,
                        "recall": 0.9516129032258065,
                        "f1": 0.8613138686131386,
                        "true_negative_rate": 0.025,
                        "false_positive_rate": 0.2,
                        "false_negative_rate": 0.0375,
                        "true_positive_rate": 0.7375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 126,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.66875,
                        "precision": 0.7857142857142857,
                        "recall": 0.7983870967741935,
                        "f1": 0.792,
                        "true_negative_rate": 0.05625,
                        "false_positive_rate": 0.16875,
                        "false_negative_rate": 0.15625,
                        "true_positive_rate": 0.61875
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 6,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.2375,
                        "precision": 0.6666666666666666,
                        "recall": 0.03225806451612903,
                        "f1": 0.06153846153846154,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.75,
                        "true_positive_rate": 0.025
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 3,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.23125,
                        "precision": 0.6666666666666666,
                        "recall": 0.016129032258064516,
                        "f1": 0.031496062992125984,
                        "true_negative_rate": 0.21875,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.7625,
                        "true_positive_rate": 0.0125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.475,
                            "stdev": 0.24290719678510966
                        },
                        "precision": {
                            "average": 0.7264285714285713,
                            "stdev": 0.0597628533409825
                        },
                        "recall": {
                            "average": 0.4495967741935484,
                            "stdev": 0.428876661998617
                        },
                        "f1": {
                            "average": 0.4365870982859315,
                            "stdev": 0.39098317960589335
                        },
                        "true_negative_rate": {
                            "average": 0.128125,
                            "stdev": 0.08822246383433191
                        },
                        "false_positive_rate": {
                            "average": 0.096875,
                            "stdev": 0.08822246383433191
                        },
                        "false_negative_rate": {
                            "average": 0.42656249999999996,
                            "stdev": 0.33237941304892815
                        },
                        "true_positive_rate": {
                            "average": 0.3484375,
                            "stdev": 0.3323794130489282
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 10,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.2625,
                        "precision": 0.8,
                        "recall": 0.06451612903225806,
                        "f1": 0.11940298507462686,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.725,
                        "true_positive_rate": 0.05
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 1,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.23125,
                        "precision": 1.0,
                        "recall": 0.008064516129032258,
                        "f1": 0.016,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.76875,
                        "true_positive_rate": 0.00625
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 3,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.24375,
                        "precision": 1.0,
                        "recall": 0.024193548387096774,
                        "f1": 0.047244094488188976,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.75625,
                        "true_positive_rate": 0.01875
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 0,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.225,
                        "precision": 0.0,
                        "recall": 0.0,
                        "f1": 0.0,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.775,
                        "true_positive_rate": 0.0
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.240625,
                            "stdev": 0.014320549046737002
                        },
                        "precision": {
                            "average": 0.7,
                            "stdev": 0.412310562561766
                        },
                        "recall": {
                            "average": 0.024193548387096774,
                            "stdev": 0.02485650807648781
                        },
                        "f1": {
                            "average": 0.04566176989070396,
                            "stdev": 0.045839634571334935
                        },
                        "true_negative_rate": {
                            "average": 0.221875,
                            "stdev": 0.005412658773652747
                        },
                        "false_positive_rate": {
                            "average": 0.003125,
                            "stdev": 0.0054126587736527424
                        },
                        "false_negative_rate": {
                            "average": 0.75625,
                            "stdev": 0.019263793759278072
                        },
                        "true_positive_rate": {
                            "average": 0.01875,
                            "stdev": 0.01926379375927805
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 10,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.25,
                        "precision": 0.7,
                        "recall": 0.056451612903225805,
                        "f1": 0.1044776119402985,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.73125,
                        "true_positive_rate": 0.04375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 2,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.2375,
                        "precision": 1.0,
                        "recall": 0.016129032258064516,
                        "f1": 0.031746031746031744,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7625,
                        "true_positive_rate": 0.0125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 1,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.23125,
                        "precision": 1.0,
                        "recall": 0.008064516129032258,
                        "f1": 0.016,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.76875,
                        "true_positive_rate": 0.00625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 0,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.225,
                        "precision": 0.0,
                        "recall": 0.0,
                        "f1": 0.0,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.775,
                        "true_positive_rate": 0.0
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.2359375,
                            "stdev": 0.009243874661093146
                        },
                        "precision": {
                            "average": 0.675,
                            "stdev": 0.4085033659592048
                        },
                        "recall": {
                            "average": 0.02016129032258064,
                            "stdev": 0.02171437422231655
                        },
                        "f1": {
                            "average": 0.03805591092158256,
                            "stdev": 0.03995739138157655
                        },
                        "true_negative_rate": {
                            "average": 0.2203125,
                            "stdev": 0.008118988160479118
                        },
                        "false_positive_rate": {
                            "average": 0.0046875,
                            "stdev": 0.008118988160479111
                        },
                        "false_negative_rate": {
                            "average": 0.759375,
                            "stdev": 0.016828640022295352
                        },
                        "true_positive_rate": {
                            "average": 0.015624999999999998,
                            "stdev": 0.016828640022295324
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 36,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.325,
                        "precision": 0.7222222222222222,
                        "recall": 0.20967741935483872,
                        "f1": 0.325,
                        "true_negative_rate": 0.1625,
                        "false_positive_rate": 0.0625,
                        "false_negative_rate": 0.6125,
                        "true_positive_rate": 0.1625
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 5,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.24375,
                        "precision": 0.8,
                        "recall": 0.03225806451612903,
                        "f1": 0.06201550387596899,
                        "true_negative_rate": 0.21875,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.75,
                        "true_positive_rate": 0.025
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 22,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.275,
                        "precision": 0.6818181818181818,
                        "recall": 0.12096774193548387,
                        "f1": 0.2054794520547945,
                        "true_negative_rate": 0.18125,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.68125,
                        "true_positive_rate": 0.09375
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 4,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.225,
                        "precision": 0.5,
                        "recall": 0.016129032258064516,
                        "f1": 0.03125,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.7625,
                        "true_positive_rate": 0.0125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.2671875,
                            "stdev": 0.03785637949870537
                        },
                        "precision": {
                            "average": 0.676010101010101,
                            "stdev": 0.1101389402739879
                        },
                        "recall": {
                            "average": 0.09475806451612903,
                            "stdev": 0.07743090471641742
                        },
                        "f1": {
                            "average": 0.15593623898269088,
                            "stdev": 0.1176911720621617
                        },
                        "true_negative_rate": {
                            "average": 0.19375,
                            "stdev": 0.022963966338592292
                        },
                        "false_positive_rate": {
                            "average": 0.03125,
                            "stdev": 0.022963966338592295
                        },
                        "false_negative_rate": {
                            "average": 0.7015625000000001,
                            "stdev": 0.06000895115522347
                        },
                        "true_positive_rate": {
                            "average": 0.0734375,
                            "stdev": 0.0600089511552235
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 23,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.36875,
                        "precision": 1.0,
                        "recall": 0.18548387096774194,
                        "f1": 0.3129251700680272,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.63125,
                        "true_positive_rate": 0.14375
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 17,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.33125,
                        "precision": 1.0,
                        "recall": 0.13709677419354838,
                        "f1": 0.24113475177304963,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.66875,
                        "true_positive_rate": 0.10625
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 25,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.38125,
                        "precision": 1.0,
                        "recall": 0.20161290322580644,
                        "f1": 0.33557046979865773,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.61875,
                        "true_positive_rate": 0.15625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 10,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.2875,
                        "precision": 1.0,
                        "recall": 0.08064516129032258,
                        "f1": 0.14925373134328357,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.7125,
                        "true_positive_rate": 0.0625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3421875,
                            "stdev": 0.03654379863602032
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.15120967741935484,
                            "stdev": 0.04715328856260685
                        },
                        "f1": {
                            "average": 0.25972103074575453,
                            "stdev": 0.0726847505481553
                        },
                        "true_negative_rate": {
                            "average": 0.225,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6578124999999999,
                            "stdev": 0.036543798636020315
                        },
                        "true_positive_rate": {
                            "average": 0.1171875,
                            "stdev": 0.03654379863602031
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 62,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.575,
                        "precision": 0.9516129032258065,
                        "recall": 0.47580645161290325,
                        "f1": 0.6344086021505376,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.40625,
                        "true_positive_rate": 0.36875
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 68,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.6,
                        "precision": 0.9411764705882353,
                        "recall": 0.5161290322580645,
                        "f1": 0.6666666666666666,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.375,
                        "true_positive_rate": 0.4
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 59,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.55625,
                        "precision": 0.9491525423728814,
                        "recall": 0.45161290322580644,
                        "f1": 0.6120218579234973,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.425,
                        "true_positive_rate": 0.35
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 63,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.59375,
                        "precision": 0.9682539682539683,
                        "recall": 0.49193548387096775,
                        "f1": 0.6524064171122995,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.39375,
                        "true_positive_rate": 0.38125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5812499999999999,
                            "stdev": 0.017116329922036433
                        },
                        "precision": {
                            "average": 0.9525489711102229,
                            "stdev": 0.009853840046648483
                        },
                        "recall": {
                            "average": 0.4838709677419355,
                            "stdev": 0.02351190280179556
                        },
                        "f1": {
                            "average": 0.6413758859632502,
                            "stdev": 0.020441980162792056
                        },
                        "true_negative_rate": {
                            "average": 0.20625000000000002,
                            "stdev": 0.004419417382415916
                        },
                        "false_positive_rate": {
                            "average": 0.01875,
                            "stdev": 0.004419417382415923
                        },
                        "false_negative_rate": {
                            "average": 0.4,
                            "stdev": 0.018221724671391562
                        },
                        "true_positive_rate": {
                            "average": 0.375,
                            "stdev": 0.018221724671391576
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=baseline_errordetection_prompt_1": {
                    "total_num": 160,
                    "prediction_error_num": 88,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.6875,
                        "precision": 0.9204545454545454,
                        "recall": 0.6532258064516129,
                        "f1": 0.7641509433962265,
                        "true_negative_rate": 0.18125,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.26875,
                        "true_positive_rate": 0.50625
                    }
                },
                "prompt=baseline_errordetection_prompt_2": {
                    "total_num": 160,
                    "prediction_error_num": 102,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.75,
                        "precision": 0.9117647058823529,
                        "recall": 0.75,
                        "f1": 0.8230088495575221,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.05625,
                        "false_negative_rate": 0.19375,
                        "true_positive_rate": 0.58125
                    }
                },
                "prompt=baseline_errordetection_prompt_3": {
                    "total_num": 160,
                    "prediction_error_num": 76,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.6625,
                        "precision": 0.9605263157894737,
                        "recall": 0.5887096774193549,
                        "f1": 0.73,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.31875,
                        "true_positive_rate": 0.45625
                    }
                },
                "prompt=baseline_errordetection_prompt_4": {
                    "total_num": 160,
                    "prediction_error_num": 73,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.64375,
                        "precision": 0.958904109589041,
                        "recall": 0.5645161290322581,
                        "f1": 0.7106598984771574,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.3375,
                        "true_positive_rate": 0.4375
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6859375000000001,
                            "stdev": 0.04011093016560448
                        },
                        "precision": {
                            "average": 0.9379124191788532,
                            "stdev": 0.022025664869663175
                        },
                        "recall": {
                            "average": 0.6391129032258065,
                            "stdev": 0.07176399891353903
                        },
                        "f1": {
                            "average": 0.7569549228577265,
                            "stdev": 0.04267523020651428
                        },
                        "true_negative_rate": {
                            "average": 0.190625,
                            "stdev": 0.016237976320958216
                        },
                        "false_positive_rate": {
                            "average": 0.034375,
                            "stdev": 0.016237976320958226
                        },
                        "false_negative_rate": {
                            "average": 0.2796875,
                            "stdev": 0.05561709915799277
                        },
                        "true_positive_rate": {
                            "average": 0.4953125,
                            "stdev": 0.05561709915799279
                        }
                    }
                }
            }
        }
    },
    "average": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4845238095238096,
                            "stdev": 0.047499850817285694
                        },
                        "precision": {
                            "average": 0.5786465528548593,
                            "stdev": 0.013173237556182403
                        },
                        "recall": {
                            "average": 0.5726238781794337,
                            "stdev": 0.051938383310716395
                        },
                        "f1": {
                            "average": 0.5357040046796372,
                            "stdev": 0.06583793760215702
                        },
                        "true_negative_rate": {
                            "average": 0.1875,
                            "stdev": 0.015361295119718978
                        },
                        "false_positive_rate": {
                            "average": 0.2505952380952381,
                            "stdev": 0.01799549578313212
                        },
                        "false_negative_rate": {
                            "average": 0.2398809523809524,
                            "stdev": 0.026976831413392067
                        },
                        "true_positive_rate": {
                            "average": 0.3220238095238095,
                            "stdev": 0.03228583719600148
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5005952380952382,
                            "stdev": 0.029534848166798303
                        },
                        "precision": {
                            "average": 0.5532605739010106,
                            "stdev": 0.01997234771699683
                        },
                        "recall": {
                            "average": 0.6832267526711971,
                            "stdev": 0.11245799131274141
                        },
                        "f1": {
                            "average": 0.5918729591353422,
                            "stdev": 0.060884927485138754
                        },
                        "true_negative_rate": {
                            "average": 0.13452380952380952,
                            "stdev": 0.05788824364959144
                        },
                        "false_positive_rate": {
                            "average": 0.30357142857142855,
                            "stdev": 0.04937965512370509
                        },
                        "false_negative_rate": {
                            "average": 0.1767857142857143,
                            "stdev": 0.059672433499295886
                        },
                        "true_positive_rate": {
                            "average": 0.3851190476190476,
                            "stdev": 0.07059533847258584
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5452380952380952,
                            "stdev": 0.005892556509887924
                        },
                        "precision": {
                            "average": 0.6078295912319295,
                            "stdev": 0.04990714975735387
                        },
                        "recall": {
                            "average": 0.7912057078723745,
                            "stdev": 0.03282781787743541
                        },
                        "f1": {
                            "average": 0.6220639132795718,
                            "stdev": 0.04833613961354624
                        },
                        "true_negative_rate": {
                            "average": 0.10178571428571427,
                            "stdev": 0.024353895887474743
                        },
                        "false_positive_rate": {
                            "average": 0.3363095238095238,
                            "stdev": 0.022603378031008974
                        },
                        "false_negative_rate": {
                            "average": 0.11726190476190475,
                            "stdev": 0.017995495783132108
                        },
                        "true_positive_rate": {
                            "average": 0.4446428571428571,
                            "stdev": 0.021773025188299842
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5023809523809524,
                            "stdev": 0.03850228959237193
                        },
                        "precision": {
                            "average": 0.6163680444759458,
                            "stdev": 0.08340786288529353
                        },
                        "recall": {
                            "average": 0.3705158010713567,
                            "stdev": 0.04822014605328429
                        },
                        "f1": {
                            "average": 0.43049069306469584,
                            "stdev": 0.03109885328719537
                        },
                        "true_negative_rate": {
                            "average": 0.30952380952380953,
                            "stdev": 0.038030078202293545
                        },
                        "false_positive_rate": {
                            "average": 0.1285714285714286,
                            "stdev": 0.034472457703738704
                        },
                        "false_negative_rate": {
                            "average": 0.3541666666666667,
                            "stdev": 0.03376632048495915
                        },
                        "true_positive_rate": {
                            "average": 0.20773809523809525,
                            "stdev": 0.024061170156437786
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.531547619047619,
                            "stdev": 0.016772622385244473
                        },
                        "precision": {
                            "average": 0.6220351455131126,
                            "stdev": 0.0496461785625971
                        },
                        "recall": {
                            "average": 0.5517152600485934,
                            "stdev": 0.05253044396526141
                        },
                        "f1": {
                            "average": 0.523476107727478,
                            "stdev": 0.028600308575813728
                        },
                        "true_negative_rate": {
                            "average": 0.22738095238095238,
                            "stdev": 0.05237418787490223
                        },
                        "false_positive_rate": {
                            "average": 0.21071428571428572,
                            "stdev": 0.04899066272274346
                        },
                        "false_negative_rate": {
                            "average": 0.2517857142857143,
                            "stdev": 0.028719852575749574
                        },
                        "true_positive_rate": {
                            "average": 0.3101190476190476,
                            "stdev": 0.030906559489874218
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4851190476190476,
                            "stdev": 0.02649979677327627
                        },
                        "precision": {
                            "average": 0.6545056543697848,
                            "stdev": 0.08475903538706132
                        },
                        "recall": {
                            "average": 0.16960457238235016,
                            "stdev": 0.055315734085078935
                        },
                        "f1": {
                            "average": 0.24498069189880037,
                            "stdev": 0.06315123913037438
                        },
                        "true_negative_rate": {
                            "average": 0.39107142857142857,
                            "stdev": 0.009560939526314875
                        },
                        "false_positive_rate": {
                            "average": 0.04702380952380952,
                            "stdev": 0.019579624051664912
                        },
                        "false_negative_rate": {
                            "average": 0.4672619047619048,
                            "stdev": 0.04124789556921531
                        },
                        "true_positive_rate": {
                            "average": 0.09464285714285715,
                            "stdev": 0.029269740126533925
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5380952380952381,
                            "stdev": 0.06715383226747834
                        },
                        "precision": {
                            "average": 0.5950913451914369,
                            "stdev": 0.09420629920547871
                        },
                        "recall": {
                            "average": 0.40849479738368627,
                            "stdev": 0.19031076578159606
                        },
                        "f1": {
                            "average": 0.43605290732425045,
                            "stdev": 0.14728226698573288
                        },
                        "true_negative_rate": {
                            "average": 0.3107142857142857,
                            "stdev": 0.03679900360969935
                        },
                        "false_positive_rate": {
                            "average": 0.12738095238095237,
                            "stdev": 0.04884580533078247
                        },
                        "false_negative_rate": {
                            "average": 0.3345238095238095,
                            "stdev": 0.11387886411274911
                        },
                        "true_positive_rate": {
                            "average": 0.22738095238095238,
                            "stdev": 0.10248126315113486
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5196428571428572,
                            "stdev": 0.005050762722761088
                        },
                        "precision": {
                            "average": 0.5977696502155331,
                            "stdev": 0.12308261655442757
                        },
                        "recall": {
                            "average": 0.3736993459215681,
                            "stdev": 0.03949788444245618
                        },
                        "f1": {
                            "average": 0.40820826871209254,
                            "stdev": 0.030891199342275973
                        },
                        "true_negative_rate": {
                            "average": 0.31011904761904757,
                            "stdev": 0.02325241069598169
                        },
                        "false_positive_rate": {
                            "average": 0.12797619047619047,
                            "stdev": 0.034942047908188494
                        },
                        "false_negative_rate": {
                            "average": 0.3523809523809524,
                            "stdev": 0.029892570308308844
                        },
                        "true_positive_rate": {
                            "average": 0.20952380952380953,
                            "stdev": 0.018230229560849465
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5708333333333333,
                            "stdev": 0.04960158727618967
                        },
                        "precision": {
                            "average": 0.778177815775583,
                            "stdev": 0.09185886489963105
                        },
                        "recall": {
                            "average": 0.35075880909214235,
                            "stdev": 0.05859108543130196
                        },
                        "f1": {
                            "average": 0.4621150416439284,
                            "stdev": 0.05768585780118668
                        },
                        "true_negative_rate": {
                            "average": 0.3744047619047619,
                            "stdev": 0.032021375372173255
                        },
                        "false_positive_rate": {
                            "average": 0.06369047619047619,
                            "stdev": 0.025685058345704066
                        },
                        "false_negative_rate": {
                            "average": 0.3654761904761905,
                            "stdev": 0.04088553514541527
                        },
                        "true_positive_rate": {
                            "average": 0.19642857142857142,
                            "stdev": 0.02948682399166867
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5541666666666667,
                            "stdev": 0.10447403631064779
                        },
                        "precision": {
                            "average": 0.8970099345099346,
                            "stdev": 0.009937366546131513
                        },
                        "recall": {
                            "average": 0.23566608288830507,
                            "stdev": 0.19971120649537966
                        },
                        "f1": {
                            "average": 0.33067037146300593,
                            "stdev": 0.2296647326916274
                        },
                        "true_negative_rate": {
                            "average": 0.42321428571428577,
                            "stdev": 0.010206207261596566
                        },
                        "false_positive_rate": {
                            "average": 0.014880952380952382,
                            "stdev": 0.012227761062008412
                        },
                        "false_negative_rate": {
                            "average": 0.4309523809523809,
                            "stdev": 0.11670006810491416
                        },
                        "true_positive_rate": {
                            "average": 0.13095238095238093,
                            "stdev": 0.10905372721508025
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5625,
                            "stdev": 0.09975522764335143
                        },
                        "precision": {
                            "average": 0.8431830497142817,
                            "stdev": 0.05179244377784786
                        },
                        "recall": {
                            "average": 0.2871922316366761,
                            "stdev": 0.22868702253301998
                        },
                        "f1": {
                            "average": 0.37549287624635763,
                            "stdev": 0.22409794927140836
                        },
                        "true_negative_rate": {
                            "average": 0.40297619047619054,
                            "stdev": 0.025351832665531957
                        },
                        "false_positive_rate": {
                            "average": 0.03511904761904762,
                            "stdev": 0.03345005075768361
                        },
                        "false_negative_rate": {
                            "average": 0.40238095238095245,
                            "stdev": 0.13303538137808565
                        },
                        "true_positive_rate": {
                            "average": 0.1595238095238095,
                            "stdev": 0.1247219128924647
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4791666666666667,
                            "stdev": 0.0963555743140421
                        },
                        "precision": {
                            "average": 0.7722613690273471,
                            "stdev": 0.039919407151842244
                        },
                        "recall": {
                            "average": 0.45574436763952897,
                            "stdev": 0.15804325315025108
                        },
                        "f1": {
                            "average": 0.5204355560956593,
                            "stdev": 0.13432834339415145
                        },
                        "true_negative_rate": {
                            "average": 0.13333333333333333,
                            "stdev": 0.02870732037400139
                        },
                        "false_positive_rate": {
                            "average": 0.09583333333333333,
                            "stdev": 0.03312925811206093
                        },
                        "false_negative_rate": {
                            "average": 0.4192708333333333,
                            "stdev": 0.12237034309037281
                        },
                        "true_positive_rate": {
                            "average": 0.3515625,
                            "stdev": 0.12407863560057389
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6036458333333333,
                            "stdev": 0.0832714614061878
                        },
                        "precision": {
                            "average": 0.7771209618775923,
                            "stdev": 0.02315498344641413
                        },
                        "recall": {
                            "average": 0.6867447516641065,
                            "stdev": 0.12914826699319162
                        },
                        "f1": {
                            "average": 0.6978109937788135,
                            "stdev": 0.10309317375303076
                        },
                        "true_negative_rate": {
                            "average": 0.08020833333333333,
                            "stdev": 0.029490390972691804
                        },
                        "false_positive_rate": {
                            "average": 0.14895833333333333,
                            "stdev": 0.015220457496044216
                        },
                        "false_negative_rate": {
                            "average": 0.23958333333333334,
                            "stdev": 0.09362259861925547
                        },
                        "true_positive_rate": {
                            "average": 0.53125,
                            "stdev": 0.10828324162430063
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6161458333333334,
                            "stdev": 0.08333007806141925
                        },
                        "precision": {
                            "average": 0.7948494801309672,
                            "stdev": 0.044773639362768645
                        },
                        "recall": {
                            "average": 0.6720632787165045,
                            "stdev": 0.15269752903599607
                        },
                        "f1": {
                            "average": 0.6584034725035807,
                            "stdev": 0.12422701630659598
                        },
                        "true_negative_rate": {
                            "average": 0.09999999999999999,
                            "stdev": 0.037781064756921105
                        },
                        "false_positive_rate": {
                            "average": 0.12916666666666668,
                            "stdev": 0.03447743575715702
                        },
                        "false_negative_rate": {
                            "average": 0.2526041666666667,
                            "stdev": 0.11849158652152854
                        },
                        "true_positive_rate": {
                            "average": 0.5182291666666666,
                            "stdev": 0.12018278541339533
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4072916666666668,
                            "stdev": 0.07744845956433793
                        },
                        "precision": {
                            "average": 0.8895345007112189,
                            "stdev": 0.05379985627977447
                        },
                        "recall": {
                            "average": 0.30399812254650965,
                            "stdev": 0.11049385282899198
                        },
                        "f1": {
                            "average": 0.4097592653850133,
                            "stdev": 0.11261286873705548
                        },
                        "true_negative_rate": {
                            "average": 0.18802083333333333,
                            "stdev": 0.007475364632503822
                        },
                        "false_positive_rate": {
                            "average": 0.04114583333333333,
                            "stdev": 0.02113069039072999
                        },
                        "false_negative_rate": {
                            "average": 0.5380208333333334,
                            "stdev": 0.09366605037608644
                        },
                        "true_positive_rate": {
                            "average": 0.2328125,
                            "stdev": 0.07980485639462467
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5411458333333333,
                            "stdev": 0.04801281115204799
                        },
                        "precision": {
                            "average": 0.8234908364477155,
                            "stdev": 0.08309088229242899
                        },
                        "recall": {
                            "average": 0.5132915173237754,
                            "stdev": 0.07021443029158354
                        },
                        "f1": {
                            "average": 0.5395402883926673,
                            "stdev": 0.08240029257429192
                        },
                        "true_negative_rate": {
                            "average": 0.14531249999999998,
                            "stdev": 0.040764983186144785
                        },
                        "false_positive_rate": {
                            "average": 0.08385416666666667,
                            "stdev": 0.025334138556281898
                        },
                        "false_negative_rate": {
                            "average": 0.3744791666666667,
                            "stdev": 0.05042678101024844
                        },
                        "true_positive_rate": {
                            "average": 0.39635416666666673,
                            "stdev": 0.06019625743949719
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3510416666666667,
                            "stdev": 0.118622007045723
                        },
                        "precision": {
                            "average": 0.7904003937670186,
                            "stdev": 0.10312703828826682
                        },
                        "recall": {
                            "average": 0.17565710872162485,
                            "stdev": 0.15680935881972527
                        },
                        "f1": {
                            "average": 0.24854620853900375,
                            "stdev": 0.1973692485515418
                        },
                        "true_negative_rate": {
                            "average": 0.2177083333333333,
                            "stdev": 0.005892556509887888
                        },
                        "false_positive_rate": {
                            "average": 0.011458333333333333,
                            "stdev": 0.01178511301977579
                        },
                        "false_negative_rate": {
                            "average": 0.6375000000000001,
                            "stdev": 0.13020416659999784
                        },
                        "true_positive_rate": {
                            "average": 0.13333333333333333,
                            "stdev": 0.11685253051983466
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.42656249999999996,
                            "stdev": 0.16754488670950238
                        },
                        "precision": {
                            "average": 0.8303074942791762,
                            "stdev": 0.1272576473307861
                        },
                        "recall": {
                            "average": 0.3110193719064687,
                            "stdev": 0.27007370834969724
                        },
                        "f1": {
                            "average": 0.3661628381498477,
                            "stdev": 0.27766917153374227
                        },
                        "true_negative_rate": {
                            "average": 0.19010416666666666,
                            "stdev": 0.035270839486118775
                        },
                        "false_positive_rate": {
                            "average": 0.03906249999999999,
                            "stdev": 0.0497225374369276
                        },
                        "false_negative_rate": {
                            "average": 0.5343749999999999,
                            "stdev": 0.21384379033927237
                        },
                        "true_positive_rate": {
                            "average": 0.23645833333333335,
                            "stdev": 0.20164256054271437
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3979166666666667,
                            "stdev": 0.10238316935930215
                        },
                        "precision": {
                            "average": 0.7987074825830668,
                            "stdev": 0.08740571192694088
                        },
                        "recall": {
                            "average": 0.295045016214371,
                            "stdev": 0.15519410726049288
                        },
                        "f1": {
                            "average": 0.36358160713564475,
                            "stdev": 0.15355313817769015
                        },
                        "true_negative_rate": {
                            "average": 0.171875,
                            "stdev": 0.015520483776824317
                        },
                        "false_positive_rate": {
                            "average": 0.057291666666666664,
                            "stdev": 0.023152198719680068
                        },
                        "false_negative_rate": {
                            "average": 0.5447916666666667,
                            "stdev": 0.12513230845011838
                        },
                        "true_positive_rate": {
                            "average": 0.2260416666666667,
                            "stdev": 0.11606289078794976
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5203125000000001,
                            "stdev": 0.17216358348723657
                        },
                        "precision": {
                            "average": 0.968952973051675,
                            "stdev": 0.03863584186018477
                        },
                        "recall": {
                            "average": 0.40747994538317117,
                            "stdev": 0.24959641020808776
                        },
                        "f1": {
                            "average": 0.5091692906597366,
                            "stdev": 0.23070397598440856
                        },
                        "true_negative_rate": {
                            "average": 0.209375,
                            "stdev": 0.01275775907699572
                        },
                        "false_positive_rate": {
                            "average": 0.01979166666666667,
                            "stdev": 0.025811482694508055
                        },
                        "false_negative_rate": {
                            "average": 0.45989583333333334,
                            "stdev": 0.19728930172816994
                        },
                        "true_positive_rate": {
                            "average": 0.3109375,
                            "stdev": 0.18461760169108107
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5953124999999999,
                            "stdev": 0.18206980546620022
                        },
                        "precision": {
                            "average": 0.9107089007920998,
                            "stdev": 0.04555486728416541
                        },
                        "recall": {
                            "average": 0.5209794760197987,
                            "stdev": 0.23624500746164345
                        },
                        "f1": {
                            "average": 0.632646728172738,
                            "stdev": 0.20221222162743843
                        },
                        "true_negative_rate": {
                            "average": 0.19739583333333333,
                            "stdev": 0.0114346345833855
                        },
                        "false_positive_rate": {
                            "average": 0.03177083333333334,
                            "stdev": 0.01085034721666649
                        },
                        "false_negative_rate": {
                            "average": 0.3729166666666666,
                            "stdev": 0.18851432812845348
                        },
                        "true_positive_rate": {
                            "average": 0.3979166666666667,
                            "stdev": 0.17299038338316833
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7088541666666667,
                            "stdev": 0.09704566060909445
                        },
                        "precision": {
                            "average": 0.9214333448860926,
                            "stdev": 0.014427210122817925
                        },
                        "recall": {
                            "average": 0.6859873698583376,
                            "stdev": 0.14250211655763087
                        },
                        "f1": {
                            "average": 0.7749419123378162,
                            "stdev": 0.08774220950625859
                        },
                        "true_negative_rate": {
                            "average": 0.18229166666666666,
                            "stdev": 0.005892556509887901
                        },
                        "false_positive_rate": {
                            "average": 0.046875,
                            "stdev": 0.01767766952966369
                        },
                        "false_negative_rate": {
                            "average": 0.24427083333333333,
                            "stdev": 0.11378271431280275
                        },
                        "true_positive_rate": {
                            "average": 0.5265625,
                            "stdev": 0.09820160226544171
                        }
                    }
                }
            }
        }
    }
}