{
    "math_word_problem_generation": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 9,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.32142857142857145,
                            "precision": 1.0,
                            "recall": 0.32142857142857145,
                            "f1": 0.4864864864864865,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6785714285714286,
                            "true_positive_rate": 0.32142857142857145
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 24,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8571428571428571,
                            "precision": 1.0,
                            "recall": 0.8571428571428571,
                            "f1": 0.9230769230769231,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14285714285714285,
                            "true_positive_rate": 0.8571428571428571
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 24,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8571428571428571,
                            "precision": 1.0,
                            "recall": 0.8571428571428571,
                            "f1": 0.9230769230769231,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14285714285714285,
                            "true_positive_rate": 0.8571428571428571
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 1,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.03571428571428571,
                            "precision": 1.0,
                            "recall": 0.03571428571428571,
                            "f1": 0.06896551724137931,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9642857142857143,
                            "true_positive_rate": 0.03571428571428571
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5178571428571428,
                                "stdev": 0.354004064314268
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5178571428571428,
                                "stdev": 0.354004064314268
                            },
                            "f1": {
                                "average": 0.600401462470428,
                                "stdev": 0.3548378824865455
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4821428571428571,
                                "stdev": 0.354004064314268
                            },
                            "true_positive_rate": {
                                "average": 0.5178571428571428,
                                "stdev": 0.354004064314268
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 18,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6428571428571429,
                            "precision": 1.0,
                            "recall": 0.6428571428571429,
                            "f1": 0.782608695652174,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.35714285714285715,
                            "true_positive_rate": 0.6428571428571429
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 16,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5714285714285714,
                            "precision": 1.0,
                            "recall": 0.5714285714285714,
                            "f1": 0.7272727272727273,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.42857142857142855,
                            "true_positive_rate": 0.5714285714285714
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 19,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6785714285714286,
                            "precision": 1.0,
                            "recall": 0.6785714285714286,
                            "f1": 0.8085106382978723,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.32142857142857145,
                            "true_positive_rate": 0.6785714285714286
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 5,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.17857142857142858,
                            "precision": 1.0,
                            "recall": 0.17857142857142858,
                            "f1": 0.30303030303030304,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8214285714285714,
                            "true_positive_rate": 0.17857142857142858
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5178571428571429,
                                "stdev": 0.19964892656248123
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5178571428571429,
                                "stdev": 0.19964892656248123
                            },
                            "f1": {
                                "average": 0.6553555910632692,
                                "stdev": 0.2055206827724588
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.48214285714285715,
                                "stdev": 0.1996489265624812
                            },
                            "true_positive_rate": {
                                "average": 0.5178571428571429,
                                "stdev": 0.19964892656248123
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 0,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "f1": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.25,
                                "stdev": 0.4330127018922193
                            },
                            "true_positive_rate": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 15,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5357142857142857,
                            "precision": 1.0,
                            "recall": 0.5357142857142857,
                            "f1": 0.6976744186046512,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4642857142857143,
                            "true_positive_rate": 0.5357142857142857
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 15,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5357142857142857,
                            "precision": 1.0,
                            "recall": 0.5357142857142857,
                            "f1": 0.6976744186046512,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4642857142857143,
                            "true_positive_rate": 0.5357142857142857
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 8,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.2857142857142857,
                            "precision": 1.0,
                            "recall": 0.2857142857142857,
                            "f1": 0.4444444444444444,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7142857142857143,
                            "true_positive_rate": 0.2857142857142857
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 4,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.14285714285714285,
                            "precision": 1.0,
                            "recall": 0.14285714285714285,
                            "f1": 0.25,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8571428571428571,
                            "true_positive_rate": 0.14285714285714285
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.375,
                                "stdev": 0.16846394878672508
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.375,
                                "stdev": 0.16846394878672508
                            },
                            "f1": {
                                "average": 0.5224483204134367,
                                "stdev": 0.1882292903196278
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.625,
                                "stdev": 0.16846394878672505
                            },
                            "true_positive_rate": {
                                "average": 0.375,
                                "stdev": 0.16846394878672508
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 26,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.9285714285714286,
                            "precision": 1.0,
                            "recall": 0.9285714285714286,
                            "f1": 0.9629629629629629,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.07142857142857142,
                            "true_positive_rate": 0.9285714285714286
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 20,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.7142857142857143,
                            "precision": 1.0,
                            "recall": 0.7142857142857143,
                            "f1": 0.8333333333333334,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2857142857142857,
                            "true_positive_rate": 0.7142857142857143
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 5,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.17857142857142858,
                            "precision": 1.0,
                            "recall": 0.17857142857142858,
                            "f1": 0.30303030303030304,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8214285714285714,
                            "true_positive_rate": 0.17857142857142858
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 2,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.07142857142857142,
                            "precision": 1.0,
                            "recall": 0.07142857142857142,
                            "f1": 0.13333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9285714285714286,
                            "true_positive_rate": 0.07142857142857142
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4732142857142857,
                                "stdev": 0.3583684328640978
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4732142857142857,
                                "stdev": 0.3583684328640978
                            },
                            "f1": {
                                "average": 0.5581649831649832,
                                "stdev": 0.3482652225994596
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5267857142857142,
                                "stdev": 0.3583684328640978
                            },
                            "true_positive_rate": {
                                "average": 0.4732142857142857,
                                "stdev": 0.3583684328640978
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 17,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6071428571428571,
                            "precision": 1.0,
                            "recall": 0.6071428571428571,
                            "f1": 0.7555555555555555,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.39285714285714285,
                            "true_positive_rate": 0.6071428571428571
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 4,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.14285714285714285,
                            "precision": 1.0,
                            "recall": 0.14285714285714285,
                            "f1": 0.25,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8571428571428571,
                            "true_positive_rate": 0.14285714285714285
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 7,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 1,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.03571428571428571,
                            "precision": 1.0,
                            "recall": 0.03571428571428571,
                            "f1": 0.06896551724137931,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9642857142857143,
                            "true_positive_rate": 0.03571428571428571
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25892857142857145,
                                "stdev": 0.21484302527707078
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25892857142857145,
                                "stdev": 0.21484302527707078
                            },
                            "f1": {
                                "average": 0.36863026819923367,
                                "stdev": 0.2522733346831762
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7410714285714286,
                                "stdev": 0.2148430252770708
                            },
                            "true_positive_rate": {
                                "average": 0.25892857142857145,
                                "stdev": 0.21484302527707078
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 23,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8214285714285714,
                            "precision": 1.0,
                            "recall": 0.8214285714285714,
                            "f1": 0.9019607843137255,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17857142857142858,
                            "true_positive_rate": 0.8214285714285714
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 20,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.7142857142857143,
                            "precision": 1.0,
                            "recall": 0.7142857142857143,
                            "f1": 0.8333333333333334,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2857142857142857,
                            "true_positive_rate": 0.7142857142857143
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 7,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6964285714285714,
                                "stdev": 0.27721740529035754
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6964285714285714,
                                "stdev": 0.27721740529035754
                            },
                            "f1": {
                                "average": 0.7838235294117647,
                                "stdev": 0.22937981594192905
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3035714285714286,
                                "stdev": 0.27721740529035754
                            },
                            "true_positive_rate": {
                                "average": 0.6964285714285714,
                                "stdev": 0.27721740529035754
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 21,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.75,
                            "precision": 1.0,
                            "recall": 0.75,
                            "f1": 0.8571428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25,
                            "true_positive_rate": 0.75
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 3,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.10714285714285714,
                            "precision": 1.0,
                            "recall": 0.10714285714285714,
                            "f1": 0.1935483870967742,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8928571428571429,
                            "true_positive_rate": 0.10714285714285714
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 21,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.75,
                            "precision": 1.0,
                            "recall": 0.75,
                            "f1": 0.8571428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25,
                            "true_positive_rate": 0.75
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 0,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4017857142857143,
                                "stdev": 0.35026866510594695
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.4017857142857143,
                                "stdev": 0.35026866510594695
                            },
                            "f1": {
                                "average": 0.4769585253456221,
                                "stdev": 0.38629360389809775
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5982142857142857,
                                "stdev": 0.350268665105947
                            },
                            "true_positive_rate": {
                                "average": 0.4017857142857143,
                                "stdev": 0.35026866510594695
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 15,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5357142857142857,
                            "precision": 1.0,
                            "recall": 0.5357142857142857,
                            "f1": 0.6976744186046512,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4642857142857143,
                            "true_positive_rate": 0.5357142857142857
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 9,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.32142857142857145,
                            "precision": 1.0,
                            "recall": 0.32142857142857145,
                            "f1": 0.4864864864864865,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6785714285714286,
                            "true_positive_rate": 0.32142857142857145
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 18,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6428571428571429,
                            "precision": 1.0,
                            "recall": 0.6428571428571429,
                            "f1": 0.782608695652174,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.35714285714285715,
                            "true_positive_rate": 0.6428571428571429
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 6,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.21428571428571427,
                            "precision": 1.0,
                            "recall": 0.21428571428571427,
                            "f1": 0.35294117647058826,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7857142857142857,
                            "true_positive_rate": 0.21428571428571427
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.42857142857142855,
                                "stdev": 0.16940773179473462
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.42857142857142855,
                                "stdev": 0.16940773179473462
                            },
                            "f1": {
                                "average": 0.579927694303475,
                                "stdev": 0.16970416758845785
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5714285714285714,
                                "stdev": 0.16940773179473462
                            },
                            "true_positive_rate": {
                                "average": 0.42857142857142855,
                                "stdev": 0.16940773179473462
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 20,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.7142857142857143,
                            "precision": 1.0,
                            "recall": 0.7142857142857143,
                            "f1": 0.8333333333333334,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2857142857142857,
                            "true_positive_rate": 0.7142857142857143
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 17,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6071428571428571,
                            "precision": 1.0,
                            "recall": 0.6071428571428571,
                            "f1": 0.7555555555555555,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.39285714285714285,
                            "true_positive_rate": 0.6071428571428571
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 16,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5714285714285714,
                            "precision": 1.0,
                            "recall": 0.5714285714285714,
                            "f1": 0.7272727272727273,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.42857142857142855,
                            "true_positive_rate": 0.5714285714285714
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 11,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.39285714285714285,
                            "precision": 1.0,
                            "recall": 0.39285714285714285,
                            "f1": 0.5641025641025641,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6071428571428571,
                            "true_positive_rate": 0.39285714285714285
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5714285714285714,
                                "stdev": 0.11572751247156894
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5714285714285714,
                                "stdev": 0.11572751247156894
                            },
                            "f1": {
                                "average": 0.720066045066045,
                                "stdev": 0.09806315262228248
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4285714285714286,
                                "stdev": 0.11572751247156893
                            },
                            "true_positive_rate": {
                                "average": 0.5714285714285714,
                                "stdev": 0.11572751247156894
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 24,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8571428571428571,
                            "precision": 1.0,
                            "recall": 0.8571428571428571,
                            "f1": 0.9230769230769231,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14285714285714285,
                            "true_positive_rate": 0.8571428571428571
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 23,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8214285714285714,
                            "precision": 1.0,
                            "recall": 0.8214285714285714,
                            "f1": 0.9019607843137255,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17857142857142858,
                            "true_positive_rate": 0.8214285714285714
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 22,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.7857142857142857,
                            "precision": 1.0,
                            "recall": 0.7857142857142857,
                            "f1": 0.88,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.21428571428571427,
                            "true_positive_rate": 0.7857142857142857
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 19,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6785714285714286,
                            "precision": 1.0,
                            "recall": 0.6785714285714286,
                            "f1": 0.8085106382978723,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.32142857142857145,
                            "true_positive_rate": 0.6785714285714286
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7857142857142856,
                                "stdev": 0.06681531047810607
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7857142857142856,
                                "stdev": 0.06681531047810607
                            },
                            "f1": {
                                "average": 0.8783870864221303,
                                "stdev": 0.04312255855462447
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2142857142857143,
                                "stdev": 0.06681531047810611
                            },
                            "true_positive_rate": {
                                "average": 0.7857142857142856,
                                "stdev": 0.06681531047810607
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.5251623376623376,
                            "stdev": 0.1568722850393251
                        },
                        "precision": {
                            "average": 0.9545454545454546,
                            "stdev": 0.09642365197998375
                        },
                        "recall": {
                            "average": 0.5251623376623376,
                            "stdev": 0.1568722850393251
                        },
                        "f1": {
                            "average": 0.6267421368963989,
                            "stdev": 0.14149932325414086
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.47483766233766234,
                            "stdev": 0.1568722850393251
                        },
                        "true_positive_rate": {
                            "average": 0.5251623376623376,
                            "stdev": 0.1568722850393251
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 19,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.24050632911392406,
                            "precision": 1.0,
                            "recall": 0.24050632911392406,
                            "f1": 0.3877551020408163,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.759493670886076,
                            "true_positive_rate": 0.24050632911392406
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 64,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.810126582278481,
                            "precision": 1.0,
                            "recall": 0.810126582278481,
                            "f1": 0.8951048951048951,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.189873417721519,
                            "true_positive_rate": 0.810126582278481
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 72,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9113924050632911,
                            "precision": 1.0,
                            "recall": 0.9113924050632911,
                            "f1": 0.9536423841059603,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.08860759493670886,
                            "true_positive_rate": 0.9113924050632911
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 2,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.02531645569620253,
                            "precision": 1.0,
                            "recall": 0.02531645569620253,
                            "f1": 0.04938271604938271,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9746835443037974,
                            "true_positive_rate": 0.02531645569620253
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.49683544303797467,
                                "stdev": 0.3735115737026007
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.49683544303797467,
                                "stdev": 0.3735115737026007
                            },
                            "f1": {
                                "average": 0.5714712743252637,
                                "stdev": 0.373202884598773
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5031645569620253,
                                "stdev": 0.3735115737026007
                            },
                            "true_positive_rate": {
                                "average": 0.49683544303797467,
                                "stdev": 0.3735115737026007
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 60,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.759493670886076,
                            "precision": 1.0,
                            "recall": 0.759493670886076,
                            "f1": 0.8633093525179856,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.24050632911392406,
                            "true_positive_rate": 0.759493670886076
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 18,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.22784810126582278,
                            "precision": 1.0,
                            "recall": 0.22784810126582278,
                            "f1": 0.3711340206185567,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7721518987341772,
                            "true_positive_rate": 0.22784810126582278
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 69,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8734177215189873,
                            "precision": 1.0,
                            "recall": 0.8734177215189873,
                            "f1": 0.9324324324324325,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.12658227848101267,
                            "true_positive_rate": 0.8734177215189873
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 19,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.24050632911392406,
                            "precision": 1.0,
                            "recall": 0.24050632911392406,
                            "f1": 0.3877551020408163,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.759493670886076,
                            "true_positive_rate": 0.24050632911392406
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5253164556962026,
                                "stdev": 0.2939462916388132
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5253164556962026,
                                "stdev": 0.2939462916388132
                            },
                            "f1": {
                                "average": 0.6386577269024478,
                                "stdev": 0.26042896876464183
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4746835443037975,
                                "stdev": 0.2939462916388132
                            },
                            "true_positive_rate": {
                                "average": 0.5253164556962026,
                                "stdev": 0.2939462916388132
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 78,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9873417721518988,
                            "precision": 1.0,
                            "recall": 0.9873417721518988,
                            "f1": 0.9936305732484076,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.012658227848101266,
                            "true_positive_rate": 0.9873417721518988
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 71,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8987341772151899,
                            "precision": 1.0,
                            "recall": 0.8987341772151899,
                            "f1": 0.9466666666666667,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.10126582278481013,
                            "true_positive_rate": 0.8987341772151899
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 78,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9873417721518988,
                            "precision": 1.0,
                            "recall": 0.9873417721518988,
                            "f1": 0.9936305732484076,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.012658227848101266,
                            "true_positive_rate": 0.9873417721518988
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 2,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.02531645569620253,
                            "precision": 1.0,
                            "recall": 0.02531645569620253,
                            "f1": 0.04938271604938271,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9746835443037974,
                            "true_positive_rate": 0.02531645569620253
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7246835443037976,
                                "stdev": 0.4053969156138617
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7246835443037976,
                                "stdev": 0.4053969156138617
                            },
                            "f1": {
                                "average": 0.7458276323032162,
                                "stdev": 0.4025495106760237
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.27531645569620256,
                                "stdev": 0.40539691561386165
                            },
                            "true_positive_rate": {
                                "average": 0.7246835443037976,
                                "stdev": 0.4053969156138617
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 55,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.6962025316455697,
                            "precision": 1.0,
                            "recall": 0.6962025316455697,
                            "f1": 0.8208955223880597,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3037974683544304,
                            "true_positive_rate": 0.6962025316455697
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 46,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.5822784810126582,
                            "precision": 1.0,
                            "recall": 0.5822784810126582,
                            "f1": 0.736,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4177215189873418,
                            "true_positive_rate": 0.5822784810126582
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 38,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.4810126582278481,
                            "precision": 1.0,
                            "recall": 0.4810126582278481,
                            "f1": 0.6495726495726496,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5189873417721519,
                            "true_positive_rate": 0.4810126582278481
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 18,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.22784810126582278,
                            "precision": 1.0,
                            "recall": 0.22784810126582278,
                            "f1": 0.3711340206185567,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7721518987341772,
                            "true_positive_rate": 0.22784810126582278
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.49683544303797467,
                                "stdev": 0.17295396736632543
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.49683544303797467,
                                "stdev": 0.17295396736632543
                            },
                            "f1": {
                                "average": 0.6444005481448165,
                                "stdev": 0.16899872870286115
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5031645569620253,
                                "stdev": 0.17295396736632543
                            },
                            "true_positive_rate": {
                                "average": 0.49683544303797467,
                                "stdev": 0.17295396736632543
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 53,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.6708860759493671,
                            "precision": 1.0,
                            "recall": 0.6708860759493671,
                            "f1": 0.803030303030303,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3291139240506329,
                            "true_positive_rate": 0.6708860759493671
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 20,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.25316455696202533,
                            "precision": 1.0,
                            "recall": 0.25316455696202533,
                            "f1": 0.40404040404040403,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7468354430379747,
                            "true_positive_rate": 0.25316455696202533
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 6,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.0759493670886076,
                            "precision": 1.0,
                            "recall": 0.0759493670886076,
                            "f1": 0.1411764705882353,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9240506329113924,
                            "true_positive_rate": 0.0759493670886076
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5,
                                "stdev": 0.36053735110445856
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5,
                                "stdev": 0.36053735110445856
                            },
                            "f1": {
                                "average": 0.5870617944147356,
                                "stdev": 0.33521295831308007
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5,
                                "stdev": 0.3605373511044586
                            },
                            "true_positive_rate": {
                                "average": 0.5,
                                "stdev": 0.36053735110445856
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 53,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.6708860759493671,
                            "precision": 1.0,
                            "recall": 0.6708860759493671,
                            "f1": 0.803030303030303,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3291139240506329,
                            "true_positive_rate": 0.6708860759493671
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 20,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.25316455696202533,
                            "precision": 1.0,
                            "recall": 0.25316455696202533,
                            "f1": 0.40404040404040403,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7468354430379747,
                            "true_positive_rate": 0.25316455696202533
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 42,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.5316455696202531,
                            "precision": 1.0,
                            "recall": 0.5316455696202531,
                            "f1": 0.6942148760330579,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.46835443037974683,
                            "true_positive_rate": 0.5316455696202531
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 15,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.189873417721519,
                            "precision": 1.0,
                            "recall": 0.189873417721519,
                            "f1": 0.3191489361702128,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.810126582278481,
                            "true_positive_rate": 0.189873417721519
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4113924050632911,
                                "stdev": 0.1974237210187623
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4113924050632911,
                                "stdev": 0.1974237210187623
                            },
                            "f1": {
                                "average": 0.5551086298184944,
                                "stdev": 0.19957097086649436
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5886075949367089,
                                "stdev": 0.1974237210187623
                            },
                            "true_positive_rate": {
                                "average": 0.4113924050632911,
                                "stdev": 0.1974237210187623
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 76,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9620253164556962,
                            "precision": 1.0,
                            "recall": 0.9620253164556962,
                            "f1": 0.9806451612903225,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0379746835443038,
                            "true_positive_rate": 0.9620253164556962
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 69,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8734177215189873,
                            "precision": 1.0,
                            "recall": 0.8734177215189873,
                            "f1": 0.9324324324324325,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.12658227848101267,
                            "true_positive_rate": 0.8734177215189873
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 57,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.7215189873417721,
                            "precision": 1.0,
                            "recall": 0.7215189873417721,
                            "f1": 0.8382352941176471,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.27848101265822783,
                            "true_positive_rate": 0.7215189873417721
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 29,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.3670886075949367,
                            "precision": 1.0,
                            "recall": 0.3670886075949367,
                            "f1": 0.5370370370370371,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6329113924050633,
                            "true_positive_rate": 0.3670886075949367
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7310126582278481,
                                "stdev": 0.22703352986310682
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7310126582278481,
                                "stdev": 0.22703352986310682
                            },
                            "f1": {
                                "average": 0.8220874812193598,
                                "stdev": 0.1723594019466582
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2689873417721519,
                                "stdev": 0.22703352986310688
                            },
                            "true_positive_rate": {
                                "average": 0.7310126582278481,
                                "stdev": 0.22703352986310682
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 65,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8227848101265823,
                            "precision": 1.0,
                            "recall": 0.8227848101265823,
                            "f1": 0.9027777777777778,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17721518987341772,
                            "true_positive_rate": 0.8227848101265823
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 24,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.3037974683544304,
                            "precision": 1.0,
                            "recall": 0.3037974683544304,
                            "f1": 0.46601941747572817,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6962025316455697,
                            "true_positive_rate": 0.3037974683544304
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 59,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.7468354430379747,
                            "precision": 1.0,
                            "recall": 0.7468354430379747,
                            "f1": 0.855072463768116,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25316455696202533,
                            "true_positive_rate": 0.7468354430379747
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 7,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.08860759493670886,
                            "precision": 1.0,
                            "recall": 0.08860759493670886,
                            "f1": 0.16279069767441862,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9113924050632911,
                            "true_positive_rate": 0.08860759493670886
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.49050632911392406,
                                "stdev": 0.3051624136458012
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.49050632911392406,
                                "stdev": 0.3051624136458012
                            },
                            "f1": {
                                "average": 0.5966650891740101,
                                "stdev": 0.3024047901825804
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.509493670886076,
                                "stdev": 0.30516241364580116
                            },
                            "true_positive_rate": {
                                "average": 0.49050632911392406,
                                "stdev": 0.3051624136458012
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 64,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.810126582278481,
                            "precision": 1.0,
                            "recall": 0.810126582278481,
                            "f1": 0.8951048951048951,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.189873417721519,
                            "true_positive_rate": 0.810126582278481
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 56,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.7088607594936709,
                            "precision": 1.0,
                            "recall": 0.7088607594936709,
                            "f1": 0.8296296296296296,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2911392405063291,
                            "true_positive_rate": 0.7088607594936709
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 70,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8860759493670886,
                            "precision": 1.0,
                            "recall": 0.8860759493670886,
                            "f1": 0.9395973154362416,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.11392405063291139,
                            "true_positive_rate": 0.8860759493670886
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 48,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.6075949367088608,
                            "precision": 1.0,
                            "recall": 0.6075949367088608,
                            "f1": 0.7559055118110236,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3924050632911392,
                            "true_positive_rate": 0.6075949367088608
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7531645569620253,
                                "stdev": 0.10495648070744934
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7531645569620253,
                                "stdev": 0.10495648070744934
                            },
                            "f1": {
                                "average": 0.8550593379954475,
                                "stdev": 0.0693333750595046
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2468354430379747,
                                "stdev": 0.10495648070744935
                            },
                            "true_positive_rate": {
                                "average": 0.7531645569620253,
                                "stdev": 0.10495648070744934
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 75,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9493670886075949,
                            "precision": 1.0,
                            "recall": 0.9493670886075949,
                            "f1": 0.974025974025974,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.05063291139240506,
                            "true_positive_rate": 0.9493670886075949
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 74,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9367088607594937,
                            "precision": 1.0,
                            "recall": 0.9367088607594937,
                            "f1": 0.9673202614379085,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.06329113924050633,
                            "true_positive_rate": 0.9367088607594937
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 73,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9240506329113924,
                            "precision": 1.0,
                            "recall": 0.9240506329113924,
                            "f1": 0.9605263157894737,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0759493670886076,
                            "true_positive_rate": 0.9240506329113924
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 67,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8481012658227848,
                            "precision": 1.0,
                            "recall": 0.8481012658227848,
                            "f1": 0.9178082191780822,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1518987341772152,
                            "true_positive_rate": 0.8481012658227848
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9145569620253164,
                                "stdev": 0.03939841644933144
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9145569620253164,
                                "stdev": 0.03939841644933144
                            },
                            "f1": {
                                "average": 0.9549201926078595,
                                "stdev": 0.021951764049653568
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.08544303797468356,
                                "stdev": 0.03939841644933143
                            },
                            "true_positive_rate": {
                                "average": 0.9145569620253164,
                                "stdev": 0.03939841644933144
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 79,
                        "prediction_error_num": 78,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9873417721518988,
                            "precision": 1.0,
                            "recall": 0.9873417721518988,
                            "f1": 0.9936305732484076,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.012658227848101266,
                            "true_positive_rate": 0.9873417721518988
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 79,
                        "prediction_error_num": 76,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9620253164556962,
                            "precision": 1.0,
                            "recall": 0.9620253164556962,
                            "f1": 0.9806451612903225,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0379746835443038,
                            "true_positive_rate": 0.9620253164556962
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 79,
                        "prediction_error_num": 77,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9746835443037974,
                            "precision": 1.0,
                            "recall": 0.9746835443037974,
                            "f1": 0.9871794871794872,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.02531645569620253,
                            "true_positive_rate": 0.9746835443037974
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 79,
                        "prediction_error_num": 73,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9240506329113924,
                            "precision": 1.0,
                            "recall": 0.9240506329113924,
                            "f1": 0.9605263157894737,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0759493670886076,
                            "true_positive_rate": 0.9240506329113924
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9620253164556962,
                                "stdev": 0.02368137586565785
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9620253164556962,
                                "stdev": 0.02368137586565785
                            },
                            "f1": {
                                "average": 0.9804953843769229,
                                "stdev": 0.01240963889824019
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0379746835443038,
                                "stdev": 0.02368137586565786
                            },
                            "true_positive_rate": {
                                "average": 0.9620253164556962,
                                "stdev": 0.02368137586565785
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.6369390103567318,
                            "stdev": 0.1797963174259988
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6369390103567318,
                            "stdev": 0.1797963174259988
                        },
                        "f1": {
                            "average": 0.7228868264802341,
                            "stdev": 0.14977239443633703
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.36306098964326816,
                            "stdev": 0.17979631742599883
                        },
                        "true_positive_rate": {
                            "average": 0.6369390103567318,
                            "stdev": 0.1797963174259988
                        }
                    }
                }
            }
        },
        "Instruction-Following": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 21,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.3181818181818182,
                            "precision": 1.0,
                            "recall": 0.3181818181818182,
                            "f1": 0.4827586206896552,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6818181818181818,
                            "true_positive_rate": 0.3181818181818182
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 55,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.8333333333333334,
                            "precision": 1.0,
                            "recall": 0.8333333333333334,
                            "f1": 0.9090909090909091,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.16666666666666666,
                            "true_positive_rate": 0.8333333333333334
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 54,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.8181818181818182,
                            "precision": 1.0,
                            "recall": 0.8181818181818182,
                            "f1": 0.9,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.18181818181818182,
                            "true_positive_rate": 0.8181818181818182
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 3,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.045454545454545456,
                            "precision": 1.0,
                            "recall": 0.045454545454545456,
                            "f1": 0.08695652173913043,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9545454545454546,
                            "true_positive_rate": 0.045454545454545456
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5037878787878788,
                                "stdev": 0.33614089676702363
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5037878787878788,
                                "stdev": 0.33614089676702363
                            },
                            "f1": {
                                "average": 0.5947015128799237,
                                "stdev": 0.33999413444506715
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4962121212121212,
                                "stdev": 0.33614089676702363
                            },
                            "true_positive_rate": {
                                "average": 0.5037878787878788,
                                "stdev": 0.33614089676702363
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 41,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.6212121212121212,
                            "precision": 1.0,
                            "recall": 0.6212121212121212,
                            "f1": 0.7663551401869159,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3787878787878788,
                            "true_positive_rate": 0.6212121212121212
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 29,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.4393939393939394,
                            "precision": 1.0,
                            "recall": 0.4393939393939394,
                            "f1": 0.6105263157894737,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5606060606060606,
                            "true_positive_rate": 0.4393939393939394
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 60,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.9090909090909091,
                            "precision": 1.0,
                            "recall": 0.9090909090909091,
                            "f1": 0.9523809523809523,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.09090909090909091,
                            "true_positive_rate": 0.9090909090909091
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 13,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.19696969696969696,
                            "precision": 1.0,
                            "recall": 0.19696969696969696,
                            "f1": 0.3291139240506329,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.803030303030303,
                            "true_positive_rate": 0.19696969696969696
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5416666666666666,
                                "stdev": 0.26009794545610443
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5416666666666666,
                                "stdev": 0.26009794545610443
                            },
                            "f1": {
                                "average": 0.6645940831019936,
                                "stdev": 0.22838939086990903
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4583333333333333,
                                "stdev": 0.26009794545610443
                            },
                            "true_positive_rate": {
                                "average": 0.5416666666666666,
                                "stdev": 0.26009794545610443
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 65,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.9848484848484849,
                            "precision": 1.0,
                            "recall": 0.9848484848484849,
                            "f1": 0.9923664122137404,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.015151515151515152,
                            "true_positive_rate": 0.9848484848484849
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 2,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.030303030303030304,
                            "precision": 1.0,
                            "recall": 0.030303030303030304,
                            "f1": 0.058823529411764705,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9696969696969697,
                            "true_positive_rate": 0.030303030303030304
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7537878787878788,
                                "stdev": 0.4177499691573248
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7537878787878788,
                                "stdev": 0.4177499691573248
                            },
                            "f1": {
                                "average": 0.7627974854063763,
                                "stdev": 0.406451500407264
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.24621212121212122,
                                "stdev": 0.4177499691573248
                            },
                            "true_positive_rate": {
                                "average": 0.7537878787878788,
                                "stdev": 0.4177499691573248
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 35,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.5303030303030303,
                            "precision": 1.0,
                            "recall": 0.5303030303030303,
                            "f1": 0.693069306930693,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4696969696969697,
                            "true_positive_rate": 0.5303030303030303
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 32,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.48484848484848486,
                            "precision": 1.0,
                            "recall": 0.48484848484848486,
                            "f1": 0.6530612244897959,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5151515151515151,
                            "true_positive_rate": 0.48484848484848486
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 21,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.3181818181818182,
                            "precision": 1.0,
                            "recall": 0.3181818181818182,
                            "f1": 0.4827586206896552,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6818181818181818,
                            "true_positive_rate": 0.3181818181818182
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 11,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.16666666666666666,
                            "precision": 1.0,
                            "recall": 0.16666666666666666,
                            "f1": 0.2857142857142857,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8333333333333334,
                            "true_positive_rate": 0.16666666666666666
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.375,
                                "stdev": 0.14388954479714075
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.375,
                                "stdev": 0.14388954479714075
                            },
                            "f1": {
                                "average": 0.5286508594561075,
                                "stdev": 0.1609624499985799
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.625,
                                "stdev": 0.14388954479714075
                            },
                            "true_positive_rate": {
                                "average": 0.375,
                                "stdev": 0.14388954479714075
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 62,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.9393939393939394,
                            "precision": 1.0,
                            "recall": 0.9393939393939394,
                            "f1": 0.96875,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.06060606060606061,
                            "true_positive_rate": 0.9393939393939394
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 46,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.696969696969697,
                            "precision": 1.0,
                            "recall": 0.696969696969697,
                            "f1": 0.8214285714285714,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.30303030303030304,
                            "true_positive_rate": 0.696969696969697
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 14,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.21212121212121213,
                            "precision": 1.0,
                            "recall": 0.21212121212121213,
                            "f1": 0.35,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7878787878787878,
                            "true_positive_rate": 0.21212121212121213
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 9,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.13636363636363635,
                            "precision": 1.0,
                            "recall": 0.13636363636363635,
                            "f1": 0.24,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8636363636363636,
                            "true_positive_rate": 0.13636363636363635
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4962121212121212,
                                "stdev": 0.33425749986606523
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4962121212121212,
                                "stdev": 0.33425749986606523
                            },
                            "f1": {
                                "average": 0.5950446428571428,
                                "stdev": 0.3070052737687273
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5037878787878788,
                                "stdev": 0.33425749986606523
                            },
                            "true_positive_rate": {
                                "average": 0.4962121212121212,
                                "stdev": 0.33425749986606523
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 30,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.45454545454545453,
                            "precision": 1.0,
                            "recall": 0.45454545454545453,
                            "f1": 0.625,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5454545454545454,
                            "true_positive_rate": 0.45454545454545453
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 9,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.13636363636363635,
                            "precision": 1.0,
                            "recall": 0.13636363636363635,
                            "f1": 0.24,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8636363636363636,
                            "true_positive_rate": 0.13636363636363635
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 18,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.2727272727272727,
                            "precision": 1.0,
                            "recall": 0.2727272727272727,
                            "f1": 0.42857142857142855,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7272727272727273,
                            "true_positive_rate": 0.2727272727272727
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 4,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.06060606060606061,
                            "precision": 1.0,
                            "recall": 0.06060606060606061,
                            "f1": 0.11428571428571428,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9393939393939394,
                            "true_positive_rate": 0.06060606060606061
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.23106060606060602,
                                "stdev": 0.14975301061189997
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.23106060606060602,
                                "stdev": 0.14975301061189997
                            },
                            "f1": {
                                "average": 0.35196428571428573,
                                "stdev": 0.19329010593152565
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.768939393939394,
                                "stdev": 0.1497530106119
                            },
                            "true_positive_rate": {
                                "average": 0.23106060606060602,
                                "stdev": 0.14975301061189997
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 64,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.9696969696969697,
                            "precision": 1.0,
                            "recall": 0.9696969696969697,
                            "f1": 0.9846153846153847,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.030303030303030304,
                            "true_positive_rate": 0.9696969696969697
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 53,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.803030303030303,
                            "precision": 1.0,
                            "recall": 0.803030303030303,
                            "f1": 0.8907563025210085,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.19696969696969696,
                            "true_positive_rate": 0.803030303030303
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 37,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.5606060606060606,
                            "precision": 1.0,
                            "recall": 0.5606060606060606,
                            "f1": 0.7184466019417476,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4393939393939394,
                            "true_positive_rate": 0.5606060606060606
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 17,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.25757575757575757,
                            "precision": 1.0,
                            "recall": 0.25757575757575757,
                            "f1": 0.40963855421686746,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7424242424242424,
                            "true_positive_rate": 0.25757575757575757
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6477272727272727,
                                "stdev": 0.2681379436803726
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6477272727272727,
                                "stdev": 0.2681379436803726
                            },
                            "f1": {
                                "average": 0.750864210823752,
                                "stdev": 0.2189150855130967
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3522727272727273,
                                "stdev": 0.2681379436803726
                            },
                            "true_positive_rate": {
                                "average": 0.6477272727272727,
                                "stdev": 0.2681379436803726
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 48,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.7272727272727273,
                            "precision": 1.0,
                            "recall": 0.7272727272727273,
                            "f1": 0.8421052631578947,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2727272727272727,
                            "true_positive_rate": 0.7272727272727273
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 8,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.12121212121212122,
                            "precision": 1.0,
                            "recall": 0.12121212121212122,
                            "f1": 0.21621621621621623,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8787878787878788,
                            "true_positive_rate": 0.12121212121212122
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 48,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.7272727272727273,
                            "precision": 1.0,
                            "recall": 0.7272727272727273,
                            "f1": 0.8421052631578947,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2727272727272727,
                            "true_positive_rate": 0.7272727272727273
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 0,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3939393939393939,
                                "stdev": 0.33607686383058843
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.3939393939393939,
                                "stdev": 0.33607686383058843
                            },
                            "f1": {
                                "average": 0.4751066856330014,
                                "stdev": 0.37487549589784114
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.606060606060606,
                                "stdev": 0.3360768638305884
                            },
                            "true_positive_rate": {
                                "average": 0.3939393939393939,
                                "stdev": 0.33607686383058843
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 28,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.42424242424242425,
                            "precision": 1.0,
                            "recall": 0.42424242424242425,
                            "f1": 0.5957446808510638,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5757575757575758,
                            "true_positive_rate": 0.42424242424242425
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 19,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.2878787878787879,
                            "precision": 1.0,
                            "recall": 0.2878787878787879,
                            "f1": 0.4470588235294118,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7121212121212122,
                            "true_positive_rate": 0.2878787878787879
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 38,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.5757575757575758,
                            "precision": 1.0,
                            "recall": 0.5757575757575758,
                            "f1": 0.7307692307692307,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.42424242424242425,
                            "true_positive_rate": 0.5757575757575758
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 14,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.21212121212121213,
                            "precision": 1.0,
                            "recall": 0.21212121212121213,
                            "f1": 0.35,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7878787878787878,
                            "true_positive_rate": 0.21212121212121213
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.37500000000000006,
                                "stdev": 0.13860738229948474
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.37500000000000006,
                                "stdev": 0.13860738229948474
                            },
                            "f1": {
                                "average": 0.5308931837874266,
                                "stdev": 0.14483326165723262
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.625,
                                "stdev": 0.13860738229948472
                            },
                            "true_positive_rate": {
                                "average": 0.37500000000000006,
                                "stdev": 0.13860738229948474
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 41,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.6212121212121212,
                            "precision": 1.0,
                            "recall": 0.6212121212121212,
                            "f1": 0.7663551401869159,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3787878787878788,
                            "true_positive_rate": 0.6212121212121212
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 36,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.5454545454545454,
                            "precision": 1.0,
                            "recall": 0.5454545454545454,
                            "f1": 0.7058823529411765,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.45454545454545453,
                            "true_positive_rate": 0.5454545454545454
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 33,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.5,
                            "precision": 1.0,
                            "recall": 0.5,
                            "f1": 0.6666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5,
                            "true_positive_rate": 0.5
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 25,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.3787878787878788,
                            "precision": 1.0,
                            "recall": 0.3787878787878788,
                            "f1": 0.5494505494505495,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6212121212121212,
                            "true_positive_rate": 0.3787878787878788
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5113636363636364,
                                "stdev": 0.08794080883518106
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5113636363636364,
                                "stdev": 0.08794080883518106
                            },
                            "f1": {
                                "average": 0.672088677311327,
                                "stdev": 0.07921124784441519
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.48863636363636365,
                                "stdev": 0.08794080883518106
                            },
                            "true_positive_rate": {
                                "average": 0.5113636363636364,
                                "stdev": 0.08794080883518106
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 66,
                        "prediction_error_num": 40,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.6060606060606061,
                            "precision": 1.0,
                            "recall": 0.6060606060606061,
                            "f1": 0.7547169811320755,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3939393939393939,
                            "true_positive_rate": 0.6060606060606061
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 66,
                        "prediction_error_num": 40,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.6060606060606061,
                            "precision": 1.0,
                            "recall": 0.6060606060606061,
                            "f1": 0.7547169811320755,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3939393939393939,
                            "true_positive_rate": 0.6060606060606061
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 66,
                        "prediction_error_num": 41,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.6212121212121212,
                            "precision": 1.0,
                            "recall": 0.6212121212121212,
                            "f1": 0.7663551401869159,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3787878787878788,
                            "true_positive_rate": 0.6212121212121212
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 66,
                        "prediction_error_num": 30,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.45454545454545453,
                            "precision": 1.0,
                            "recall": 0.45454545454545453,
                            "f1": 0.625,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5454545454545454,
                            "true_positive_rate": 0.45454545454545453
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.571969696969697,
                                "stdev": 0.06807651801367966
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.571969696969697,
                                "stdev": 0.06807651801367966
                            },
                            "f1": {
                                "average": 0.7251972756127667,
                                "stdev": 0.05804371172749499
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.428030303030303,
                                "stdev": 0.06807651801367963
                            },
                            "true_positive_rate": {
                                "average": 0.571969696969697,
                                "stdev": 0.06807651801367966
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.4910468319559229,
                            "stdev": 0.13726242602305785
                        },
                        "precision": {
                            "average": 0.9772727272727273,
                            "stdev": 0.07186994682200862
                        },
                        "recall": {
                            "average": 0.4910468319559229,
                            "stdev": 0.13726242602305785
                        },
                        "f1": {
                            "average": 0.604718445689464,
                            "stdev": 0.12101870557617077
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5089531680440771,
                            "stdev": 0.13726242602305785
                        },
                        "true_positive_rate": {
                            "average": 0.4910468319559229,
                            "stdev": 0.13726242602305785
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 30,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.297029702970297,
                            "precision": 1.0,
                            "recall": 0.297029702970297,
                            "f1": 0.4580152671755725,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7029702970297029,
                            "true_positive_rate": 0.297029702970297
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 80,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.7920792079207921,
                            "precision": 1.0,
                            "recall": 0.7920792079207921,
                            "f1": 0.8839779005524862,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2079207920792079,
                            "true_positive_rate": 0.7920792079207921
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 91,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.900990099009901,
                            "precision": 1.0,
                            "recall": 0.900990099009901,
                            "f1": 0.9479166666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.09900990099009901,
                            "true_positive_rate": 0.900990099009901
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 3,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.0297029702970297,
                            "precision": 1.0,
                            "recall": 0.0297029702970297,
                            "f1": 0.057692307692307696,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9702970297029703,
                            "true_positive_rate": 0.0297029702970297
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.504950495049505,
                                "stdev": 0.35650439380967985
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.504950495049505,
                                "stdev": 0.35650439380967985
                            },
                            "f1": {
                                "average": 0.5869005355217581,
                                "stdev": 0.35890819458445566
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.49504950495049505,
                                "stdev": 0.35650439380967985
                            },
                            "true_positive_rate": {
                                "average": 0.504950495049505,
                                "stdev": 0.35650439380967985
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 77,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.7623762376237624,
                            "precision": 1.0,
                            "recall": 0.7623762376237624,
                            "f1": 0.8651685393258427,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2376237623762376,
                            "true_positive_rate": 0.7623762376237624
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 21,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.2079207920792079,
                            "precision": 1.0,
                            "recall": 0.2079207920792079,
                            "f1": 0.3442622950819672,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7920792079207921,
                            "true_positive_rate": 0.2079207920792079
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 83,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8217821782178217,
                            "precision": 1.0,
                            "recall": 0.8217821782178217,
                            "f1": 0.9021739130434783,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1782178217821782,
                            "true_positive_rate": 0.8217821782178217
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 24,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.2376237623762376,
                            "precision": 1.0,
                            "recall": 0.2376237623762376,
                            "f1": 0.384,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7623762376237624,
                            "true_positive_rate": 0.2376237623762376
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5074257425742574,
                                "stdev": 0.28562039818889373
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5074257425742574,
                                "stdev": 0.28562039818889373
                            },
                            "f1": {
                                "average": 0.623901186862822,
                                "stdev": 0.2604784706819093
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.49257425742574257,
                                "stdev": 0.28562039818889373
                            },
                            "true_positive_rate": {
                                "average": 0.5074257425742574,
                                "stdev": 0.28562039818889373
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 99,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.9801980198019802,
                            "precision": 1.0,
                            "recall": 0.9801980198019802,
                            "f1": 0.99,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.019801980198019802,
                            "true_positive_rate": 0.9801980198019802
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 91,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.900990099009901,
                            "precision": 1.0,
                            "recall": 0.900990099009901,
                            "f1": 0.9479166666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.09900990099009901,
                            "true_positive_rate": 0.900990099009901
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 4,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.039603960396039604,
                            "precision": 1.0,
                            "recall": 0.039603960396039604,
                            "f1": 0.0761904761904762,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9603960396039604,
                            "true_positive_rate": 0.039603960396039604
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7301980198019802,
                                "stdev": 0.40043201599201367
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7301980198019802,
                                "stdev": 0.40043201599201367
                            },
                            "f1": {
                                "average": 0.7535267857142858,
                                "stdev": 0.3915483777424457
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2698019801980198,
                                "stdev": 0.40043201599201367
                            },
                            "true_positive_rate": {
                                "average": 0.7301980198019802,
                                "stdev": 0.40043201599201367
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 68,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.6732673267326733,
                            "precision": 1.0,
                            "recall": 0.6732673267326733,
                            "f1": 0.8047337278106509,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.32673267326732675,
                            "true_positive_rate": 0.6732673267326733
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 61,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.6039603960396039,
                            "precision": 1.0,
                            "recall": 0.6039603960396039,
                            "f1": 0.7530864197530864,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.39603960396039606,
                            "true_positive_rate": 0.6039603960396039
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 46,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.45544554455445546,
                            "precision": 1.0,
                            "recall": 0.45544554455445546,
                            "f1": 0.6258503401360545,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5445544554455446,
                            "true_positive_rate": 0.45544554455445546
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 18,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.1782178217821782,
                            "precision": 1.0,
                            "recall": 0.1782178217821782,
                            "f1": 0.3025210084033613,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8217821782178217,
                            "true_positive_rate": 0.1782178217821782
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4777227722772277,
                                "stdev": 0.18998230225234422
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4777227722772277,
                                "stdev": 0.18998230225234422
                            },
                            "f1": {
                                "average": 0.6215478740257883,
                                "stdev": 0.19535614799798445
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5222772277227723,
                                "stdev": 0.1899823022523442
                            },
                            "true_positive_rate": {
                                "average": 0.4777227722772277,
                                "stdev": 0.18998230225234422
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 95,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.9405940594059405,
                            "precision": 1.0,
                            "recall": 0.9405940594059405,
                            "f1": 0.9693877551020408,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0594059405940594,
                            "true_positive_rate": 0.9405940594059405
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 68,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.6732673267326733,
                            "precision": 1.0,
                            "recall": 0.6732673267326733,
                            "f1": 0.8047337278106509,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.32673267326732675,
                            "true_positive_rate": 0.6732673267326733
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 28,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.27722772277227725,
                            "precision": 1.0,
                            "recall": 0.27722772277227725,
                            "f1": 0.43410852713178294,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7227722772277227,
                            "true_positive_rate": 0.27722772277227725
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 8,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.07920792079207921,
                            "precision": 1.0,
                            "recall": 0.07920792079207921,
                            "f1": 0.14678899082568808,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9207920792079208,
                            "true_positive_rate": 0.07920792079207921
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4925742574257426,
                                "stdev": 0.33564027861355017
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4925742574257426,
                                "stdev": 0.33564027861355017
                            },
                            "f1": {
                                "average": 0.5887547502175406,
                                "stdev": 0.3204596659328113
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5074257425742574,
                                "stdev": 0.33564027861355017
                            },
                            "true_positive_rate": {
                                "average": 0.4925742574257426,
                                "stdev": 0.33564027861355017
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 71,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.7029702970297029,
                            "precision": 1.0,
                            "recall": 0.7029702970297029,
                            "f1": 0.8255813953488372,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.297029702970297,
                            "true_positive_rate": 0.7029702970297029
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 25,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.24752475247524752,
                            "precision": 1.0,
                            "recall": 0.24752475247524752,
                            "f1": 0.3968253968253968,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7524752475247525,
                            "true_positive_rate": 0.24752475247524752
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 53,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.5247524752475248,
                            "precision": 1.0,
                            "recall": 0.5247524752475248,
                            "f1": 0.6883116883116883,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4752475247524752,
                            "true_positive_rate": 0.5247524752475248
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 20,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.19801980198019803,
                            "precision": 1.0,
                            "recall": 0.19801980198019803,
                            "f1": 0.3305785123966942,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.801980198019802,
                            "true_positive_rate": 0.19801980198019803
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4183168316831683,
                                "stdev": 0.20618975312249732
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4183168316831683,
                                "stdev": 0.20618975312249732
                            },
                            "f1": {
                                "average": 0.5603242482206541,
                                "stdev": 0.203873193713948
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5816831683168318,
                                "stdev": 0.20618975312249735
                            },
                            "true_positive_rate": {
                                "average": 0.4183168316831683,
                                "stdev": 0.20618975312249732
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 92,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.9108910891089109,
                            "precision": 1.0,
                            "recall": 0.9108910891089109,
                            "f1": 0.9533678756476683,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0891089108910891,
                            "true_positive_rate": 0.9108910891089109
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 84,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8316831683168316,
                            "precision": 1.0,
                            "recall": 0.8316831683168316,
                            "f1": 0.9081081081081082,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.16831683168316833,
                            "true_positive_rate": 0.8316831683168316
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 68,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.6732673267326733,
                            "precision": 1.0,
                            "recall": 0.6732673267326733,
                            "f1": 0.8047337278106509,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.32673267326732675,
                            "true_positive_rate": 0.6732673267326733
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 36,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.3564356435643564,
                            "precision": 1.0,
                            "recall": 0.3564356435643564,
                            "f1": 0.5255474452554745,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6435643564356436,
                            "true_positive_rate": 0.3564356435643564
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.693069306930693,
                                "stdev": 0.2123525800943289
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.693069306930693,
                                "stdev": 0.2123525800943289
                            },
                            "f1": {
                                "average": 0.7979392892054755,
                                "stdev": 0.1662368003694282
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.306930693069307,
                                "stdev": 0.2123525800943289
                            },
                            "true_positive_rate": {
                                "average": 0.693069306930693,
                                "stdev": 0.2123525800943289
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 83,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8217821782178217,
                            "precision": 1.0,
                            "recall": 0.8217821782178217,
                            "f1": 0.9021739130434783,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1782178217821782,
                            "true_positive_rate": 0.8217821782178217
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 24,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.2376237623762376,
                            "precision": 1.0,
                            "recall": 0.2376237623762376,
                            "f1": 0.384,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7623762376237624,
                            "true_positive_rate": 0.2376237623762376
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 73,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.7227722772277227,
                            "precision": 1.0,
                            "recall": 0.7227722772277227,
                            "f1": 0.8390804597701149,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.27722772277227725,
                            "true_positive_rate": 0.7227722772277227
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 8,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.07920792079207921,
                            "precision": 1.0,
                            "recall": 0.07920792079207921,
                            "f1": 0.14678899082568808,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9207920792079208,
                            "true_positive_rate": 0.07920792079207921
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.46534653465346537,
                                "stdev": 0.3139566335783748
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.46534653465346537,
                                "stdev": 0.3139566335783748
                            },
                            "f1": {
                                "average": 0.5680108409098203,
                                "stdev": 0.31481404575062566
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5346534653465347,
                                "stdev": 0.31395663357837483
                            },
                            "true_positive_rate": {
                                "average": 0.46534653465346537,
                                "stdev": 0.3139566335783748
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 81,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.801980198019802,
                            "precision": 1.0,
                            "recall": 0.801980198019802,
                            "f1": 0.8901098901098901,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.19801980198019803,
                            "true_positive_rate": 0.801980198019802
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 75,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.7425742574257426,
                            "precision": 1.0,
                            "recall": 0.7425742574257426,
                            "f1": 0.8522727272727273,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25742574257425743,
                            "true_positive_rate": 0.7425742574257426
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 91,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.900990099009901,
                            "precision": 1.0,
                            "recall": 0.900990099009901,
                            "f1": 0.9479166666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.09900990099009901,
                            "true_positive_rate": 0.900990099009901
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 70,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.693069306930693,
                            "precision": 1.0,
                            "recall": 0.693069306930693,
                            "f1": 0.8187134502923976,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3069306930693069,
                            "true_positive_rate": 0.693069306930693
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7846534653465347,
                                "stdev": 0.07744795963941595
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7846534653465347,
                                "stdev": 0.07744795963941595
                            },
                            "f1": {
                                "average": 0.8772531835854205,
                                "stdev": 0.04798319058649216
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.21534653465346537,
                                "stdev": 0.0774479596394159
                            },
                            "true_positive_rate": {
                                "average": 0.7846534653465347,
                                "stdev": 0.07744795963941595
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 88,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8712871287128713,
                            "precision": 1.0,
                            "recall": 0.8712871287128713,
                            "f1": 0.9312169312169312,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.12871287128712872,
                            "true_positive_rate": 0.8712871287128713
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 86,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8514851485148515,
                            "precision": 1.0,
                            "recall": 0.8514851485148515,
                            "f1": 0.9197860962566845,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1485148514851485,
                            "true_positive_rate": 0.8514851485148515
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 83,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8217821782178217,
                            "precision": 1.0,
                            "recall": 0.8217821782178217,
                            "f1": 0.9021739130434783,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1782178217821782,
                            "true_positive_rate": 0.8217821782178217
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 76,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.7524752475247525,
                            "precision": 1.0,
                            "recall": 0.7524752475247525,
                            "f1": 0.8587570621468926,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.24752475247524752,
                            "true_positive_rate": 0.7524752475247525
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8242574257425743,
                                "stdev": 0.045033181679852104
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8242574257425743,
                                "stdev": 0.045033181679852104
                            },
                            "f1": {
                                "average": 0.9029835006659968,
                                "stdev": 0.027550349886440957
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.17574257425742573,
                                "stdev": 0.045033181679852104
                            },
                            "true_positive_rate": {
                                "average": 0.8242574257425743,
                                "stdev": 0.045033181679852104
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 101,
                        "prediction_error_num": 91,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.900990099009901,
                            "precision": 1.0,
                            "recall": 0.900990099009901,
                            "f1": 0.9479166666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.09900990099009901,
                            "true_positive_rate": 0.900990099009901
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 101,
                        "prediction_error_num": 90,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8910891089108911,
                            "precision": 1.0,
                            "recall": 0.8910891089108911,
                            "f1": 0.9424083769633508,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.10891089108910891,
                            "true_positive_rate": 0.8910891089108911
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 101,
                        "prediction_error_num": 90,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8910891089108911,
                            "precision": 1.0,
                            "recall": 0.8910891089108911,
                            "f1": 0.9424083769633508,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.10891089108910891,
                            "true_positive_rate": 0.8910891089108911
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 101,
                        "prediction_error_num": 85,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8415841584158416,
                            "precision": 1.0,
                            "recall": 0.8415841584158416,
                            "f1": 0.9139784946236559,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15841584158415842,
                            "true_positive_rate": 0.8415841584158416
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8811881188118812,
                                "stdev": 0.02321987999912591
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8811881188118812,
                                "stdev": 0.02321987999912591
                            },
                            "f1": {
                                "average": 0.9366779788042561,
                                "stdev": 0.013297082509497384
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.1188118811881188,
                                "stdev": 0.02321987999912589
                            },
                            "true_positive_rate": {
                                "average": 0.8811881188118812,
                                "stdev": 0.02321987999912591
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.6163366336633664,
                            "stdev": 0.15993449613144795
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6163366336633664,
                            "stdev": 0.15993449613144795
                        },
                        "f1": {
                            "average": 0.7107109248848925,
                            "stdev": 0.1394391134837555
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.38366336633663367,
                            "stdev": 0.15993449613144795
                        },
                        "true_positive_rate": {
                            "average": 0.6163366336633664,
                            "stdev": 0.15993449613144795
                        }
                    }
                }
            }
        }
    },
    "finegrained_fact_verification": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 28,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.8235294117647058,
                            "precision": 1.0,
                            "recall": 0.8235294117647058,
                            "f1": 0.9032258064516129,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17647058823529413,
                            "true_positive_rate": 0.8235294117647058
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 22,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.6470588235294118,
                            "precision": 1.0,
                            "recall": 0.6470588235294118,
                            "f1": 0.7857142857142857,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.35294117647058826,
                            "true_positive_rate": 0.6470588235294118
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 26,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.7647058823529411,
                            "precision": 1.0,
                            "recall": 0.7647058823529411,
                            "f1": 0.8666666666666667,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.23529411764705882,
                            "true_positive_rate": 0.7647058823529411
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 17,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.5,
                            "precision": 1.0,
                            "recall": 0.5,
                            "f1": 0.6666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5,
                            "true_positive_rate": 0.5
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6838235294117647,
                                "stdev": 0.12369561647985823
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6838235294117647,
                                "stdev": 0.12369561647985823
                            },
                            "f1": {
                                "average": 0.805568356374808,
                                "stdev": 0.09077147762595043
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3161764705882353,
                                "stdev": 0.12369561647985826
                            },
                            "true_positive_rate": {
                                "average": 0.6838235294117647,
                                "stdev": 0.12369561647985823
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 17,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.5,
                            "precision": 1.0,
                            "recall": 0.5,
                            "f1": 0.6666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5,
                            "true_positive_rate": 0.5
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 32,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.9411764705882353,
                            "precision": 1.0,
                            "recall": 0.9411764705882353,
                            "f1": 0.9696969696969697,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.058823529411764705,
                            "true_positive_rate": 0.9411764705882353
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 22,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.6470588235294118,
                            "precision": 1.0,
                            "recall": 0.6470588235294118,
                            "f1": 0.7857142857142857,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.35294117647058826,
                            "true_positive_rate": 0.6470588235294118
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7720588235294118,
                                "stdev": 0.20627588439861083
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7720588235294118,
                                "stdev": 0.20627588439861083
                            },
                            "f1": {
                                "average": 0.8555194805194805,
                                "stdev": 0.13642696770264412
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.22794117647058826,
                                "stdev": 0.20627588439861086
                            },
                            "true_positive_rate": {
                                "average": 0.7720588235294118,
                                "stdev": 0.20627588439861083
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 32,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.9411764705882353,
                            "precision": 1.0,
                            "recall": 0.9411764705882353,
                            "f1": 0.9696969696969697,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.058823529411764705,
                            "true_positive_rate": 0.9411764705882353
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 33,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.9705882352941176,
                            "precision": 1.0,
                            "recall": 0.9705882352941176,
                            "f1": 0.9850746268656716,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.029411764705882353,
                            "true_positive_rate": 0.9705882352941176
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 20,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.5882352941176471,
                            "precision": 1.0,
                            "recall": 0.5882352941176471,
                            "f1": 0.7407407407407407,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4117647058823529,
                            "true_positive_rate": 0.5882352941176471
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.875,
                                "stdev": 0.1668647899692679
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.875,
                                "stdev": 0.1668647899692679
                            },
                            "f1": {
                                "average": 0.9238780843258454,
                                "stdev": 0.1062758436061469
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.125,
                                "stdev": 0.1668647899692679
                            },
                            "true_positive_rate": {
                                "average": 0.875,
                                "stdev": 0.1668647899692679
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 27,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.7941176470588235,
                            "precision": 1.0,
                            "recall": 0.7941176470588235,
                            "f1": 0.8852459016393442,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.20588235294117646,
                            "true_positive_rate": 0.7941176470588235
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 23,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.6764705882352942,
                            "precision": 1.0,
                            "recall": 0.6764705882352942,
                            "f1": 0.8070175438596491,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3235294117647059,
                            "true_positive_rate": 0.6764705882352942
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 8,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.23529411764705882,
                            "precision": 1.0,
                            "recall": 0.23529411764705882,
                            "f1": 0.38095238095238093,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7647058823529411,
                            "true_positive_rate": 0.23529411764705882
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 2,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.058823529411764705,
                            "precision": 1.0,
                            "recall": 0.058823529411764705,
                            "f1": 0.1111111111111111,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9411764705882353,
                            "true_positive_rate": 0.058823529411764705
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.44117647058823534,
                                "stdev": 0.30352599177279843
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.44117647058823534,
                                "stdev": 0.30352599177279843
                            },
                            "f1": {
                                "average": 0.5460817343906214,
                                "stdev": 0.3160644615785969
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5588235294117647,
                                "stdev": 0.30352599177279843
                            },
                            "true_positive_rate": {
                                "average": 0.44117647058823534,
                                "stdev": 0.30352599177279843
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 31,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.9117647058823529,
                            "precision": 1.0,
                            "recall": 0.9117647058823529,
                            "f1": 0.9538461538461539,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.08823529411764706,
                            "true_positive_rate": 0.9117647058823529
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 23,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.6764705882352942,
                            "precision": 1.0,
                            "recall": 0.6764705882352942,
                            "f1": 0.8070175438596491,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3235294117647059,
                            "true_positive_rate": 0.6764705882352942
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 9,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.2647058823529412,
                            "precision": 1.0,
                            "recall": 0.2647058823529412,
                            "f1": 0.4186046511627907,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7352941176470589,
                            "true_positive_rate": 0.2647058823529412
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.713235294117647,
                                "stdev": 0.28468324533244027
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.713235294117647,
                                "stdev": 0.28468324533244027
                            },
                            "f1": {
                                "average": 0.7948670872171484,
                                "stdev": 0.228623549702993
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.286764705882353,
                                "stdev": 0.28468324533244027
                            },
                            "true_positive_rate": {
                                "average": 0.713235294117647,
                                "stdev": 0.28468324533244027
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 12,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.35294117647058826,
                            "precision": 1.0,
                            "recall": 0.35294117647058826,
                            "f1": 0.5217391304347826,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6470588235294118,
                            "true_positive_rate": 0.35294117647058826
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 8,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.23529411764705882,
                            "precision": 1.0,
                            "recall": 0.23529411764705882,
                            "f1": 0.38095238095238093,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7647058823529411,
                            "true_positive_rate": 0.23529411764705882
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 5,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.14705882352941177,
                            "precision": 1.0,
                            "recall": 0.14705882352941177,
                            "f1": 0.2564102564102564,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8529411764705882,
                            "true_positive_rate": 0.14705882352941177
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 2,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.058823529411764705,
                            "precision": 1.0,
                            "recall": 0.058823529411764705,
                            "f1": 0.1111111111111111,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9411764705882353,
                            "true_positive_rate": 0.058823529411764705
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1985294117647059,
                                "stdev": 0.1088135925510937
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1985294117647059,
                                "stdev": 0.1088135925510937
                            },
                            "f1": {
                                "average": 0.3175532197271328,
                                "stdev": 0.1517136661185929
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8014705882352942,
                                "stdev": 0.10881359255109369
                            },
                            "true_positive_rate": {
                                "average": 0.1985294117647059,
                                "stdev": 0.1088135925510937
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 29,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.8529411764705882,
                            "precision": 1.0,
                            "recall": 0.8529411764705882,
                            "f1": 0.9206349206349206,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14705882352941177,
                            "true_positive_rate": 0.8529411764705882
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 21,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.6176470588235294,
                            "precision": 1.0,
                            "recall": 0.6176470588235294,
                            "f1": 0.7636363636363637,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38235294117647056,
                            "true_positive_rate": 0.6176470588235294
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 16,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.47058823529411764,
                            "precision": 1.0,
                            "recall": 0.47058823529411764,
                            "f1": 0.64,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5294117647058824,
                            "true_positive_rate": 0.47058823529411764
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 0,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4852941176470589,
                                "stdev": 0.31161206030025135
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.4852941176470589,
                                "stdev": 0.31161206030025135
                            },
                            "f1": {
                                "average": 0.5810678210678211,
                                "stdev": 0.34991066797048853
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5147058823529411,
                                "stdev": 0.31161206030025135
                            },
                            "true_positive_rate": {
                                "average": 0.4852941176470589,
                                "stdev": 0.31161206030025135
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 20,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.5882352941176471,
                            "precision": 1.0,
                            "recall": 0.5882352941176471,
                            "f1": 0.7407407407407407,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4117647058823529,
                            "true_positive_rate": 0.5882352941176471
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 7,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.20588235294117646,
                            "precision": 1.0,
                            "recall": 0.20588235294117646,
                            "f1": 0.34146341463414637,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7941176470588235,
                            "true_positive_rate": 0.20588235294117646
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 20,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.5882352941176471,
                            "precision": 1.0,
                            "recall": 0.5882352941176471,
                            "f1": 0.7407407407407407,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4117647058823529,
                            "true_positive_rate": 0.5882352941176471
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 9,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.2647058823529412,
                            "precision": 1.0,
                            "recall": 0.2647058823529412,
                            "f1": 0.4186046511627907,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7352941176470589,
                            "true_positive_rate": 0.2647058823529412
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4117647058823529,
                                "stdev": 0.17769185255286138
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4117647058823529,
                                "stdev": 0.17769185255286138
                            },
                            "f1": {
                                "average": 0.5603873868196046,
                                "stdev": 0.18240388857491116
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5882352941176471,
                                "stdev": 0.17769185255286135
                            },
                            "true_positive_rate": {
                                "average": 0.4117647058823529,
                                "stdev": 0.17769185255286138
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 25,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.7352941176470589,
                            "precision": 1.0,
                            "recall": 0.7352941176470589,
                            "f1": 0.847457627118644,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2647058823529412,
                            "true_positive_rate": 0.7352941176470589
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 10,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.29411764705882354,
                            "precision": 1.0,
                            "recall": 0.29411764705882354,
                            "f1": 0.45454545454545453,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7058823529411765,
                            "true_positive_rate": 0.29411764705882354
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 27,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.7941176470588235,
                            "precision": 1.0,
                            "recall": 0.7941176470588235,
                            "f1": 0.8852459016393442,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.20588235294117646,
                            "true_positive_rate": 0.7941176470588235
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 9,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.2647058823529412,
                            "precision": 1.0,
                            "recall": 0.2647058823529412,
                            "f1": 0.4186046511627907,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7352941176470589,
                            "true_positive_rate": 0.2647058823529412
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5220588235294118,
                                "stdev": 0.2437585949154526
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5220588235294118,
                                "stdev": 0.2437585949154526
                            },
                            "f1": {
                                "average": 0.6514634086165584,
                                "stdev": 0.2156779249764879
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.47794117647058826,
                                "stdev": 0.24375859491545263
                            },
                            "true_positive_rate": {
                                "average": 0.5220588235294118,
                                "stdev": 0.2437585949154526
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 4,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.11764705882352941,
                            "precision": 1.0,
                            "recall": 0.11764705882352941,
                            "f1": 0.21052631578947367,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8823529411764706,
                            "true_positive_rate": 0.11764705882352941
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 3,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.08823529411764706,
                            "precision": 1.0,
                            "recall": 0.08823529411764706,
                            "f1": 0.16216216216216217,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9117647058823529,
                            "true_positive_rate": 0.08823529411764706
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 3,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.08823529411764706,
                            "precision": 1.0,
                            "recall": 0.08823529411764706,
                            "f1": 0.16216216216216217,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9117647058823529,
                            "true_positive_rate": 0.08823529411764706
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 3,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.08823529411764706,
                            "precision": 1.0,
                            "recall": 0.08823529411764706,
                            "f1": 0.16216216216216217,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9117647058823529,
                            "true_positive_rate": 0.08823529411764706
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.09558823529411765,
                                "stdev": 0.01273566770271233
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.09558823529411765,
                                "stdev": 0.01273566770271233
                            },
                            "f1": {
                                "average": 0.17425320056899005,
                                "stdev": 0.020942292836892536
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9044117647058822,
                                "stdev": 0.012735667702712335
                            },
                            "true_positive_rate": {
                                "average": 0.09558823529411765,
                                "stdev": 0.01273566770271233
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 34,
                        "prediction_error_num": 6,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.17647058823529413,
                            "precision": 1.0,
                            "recall": 0.17647058823529413,
                            "f1": 0.3,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8235294117647058,
                            "true_positive_rate": 0.17647058823529413
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 34,
                        "prediction_error_num": 7,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.20588235294117646,
                            "precision": 1.0,
                            "recall": 0.20588235294117646,
                            "f1": 0.34146341463414637,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7941176470588235,
                            "true_positive_rate": 0.20588235294117646
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 34,
                        "prediction_error_num": 6,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.17647058823529413,
                            "precision": 1.0,
                            "recall": 0.17647058823529413,
                            "f1": 0.3,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8235294117647058,
                            "true_positive_rate": 0.17647058823529413
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 34,
                        "prediction_error_num": 0,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.13970588235294118,
                                "stdev": 0.08154806254712807
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.13970588235294118,
                                "stdev": 0.08154806254712807
                            },
                            "f1": {
                                "average": 0.23536585365853657,
                                "stdev": 0.13693878491863098
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8602941176470589,
                                "stdev": 0.08154806254712808
                            },
                            "true_positive_rate": {
                                "average": 0.13970588235294118,
                                "stdev": 0.08154806254712807
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.4852941176470589,
                            "stdev": 0.250039317449327
                        },
                        "precision": {
                            "average": 0.9545454545454546,
                            "stdev": 0.09642365197998375
                        },
                        "recall": {
                            "average": 0.4852941176470589,
                            "stdev": 0.250039317449327
                        },
                        "f1": {
                            "average": 0.5860005121169588,
                            "stdev": 0.2430377331139947
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5147058823529412,
                            "stdev": 0.250039317449327
                        },
                        "true_positive_rate": {
                            "average": 0.4852941176470589,
                            "stdev": 0.250039317449327
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 71,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.797752808988764,
                            "precision": 1.0,
                            "recall": 0.797752808988764,
                            "f1": 0.8875,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.20224719101123595,
                            "true_positive_rate": 0.797752808988764
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 60,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.6741573033707865,
                            "precision": 1.0,
                            "recall": 0.6741573033707865,
                            "f1": 0.8053691275167785,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3258426966292135,
                            "true_positive_rate": 0.6741573033707865
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 77,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.8651685393258427,
                            "precision": 1.0,
                            "recall": 0.8651685393258427,
                            "f1": 0.927710843373494,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1348314606741573,
                            "true_positive_rate": 0.8651685393258427
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 29,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.3258426966292135,
                            "precision": 1.0,
                            "recall": 0.3258426966292135,
                            "f1": 0.4915254237288136,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6741573033707865,
                            "true_positive_rate": 0.3258426966292135
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6657303370786517,
                                "stdev": 0.207846188018943
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6657303370786517,
                                "stdev": 0.207846188018943
                            },
                            "f1": {
                                "average": 0.7780263486547715,
                                "stdev": 0.17118727744591733
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3342696629213483,
                                "stdev": 0.207846188018943
                            },
                            "true_positive_rate": {
                                "average": 0.6657303370786517,
                                "stdev": 0.207846188018943
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 87,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.9775280898876404,
                            "precision": 1.0,
                            "recall": 0.9775280898876404,
                            "f1": 0.9886363636363636,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.02247191011235955,
                            "true_positive_rate": 0.9775280898876404
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 57,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.6404494382022472,
                            "precision": 1.0,
                            "recall": 0.6404494382022472,
                            "f1": 0.7808219178082192,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3595505617977528,
                            "true_positive_rate": 0.6404494382022472
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 80,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.898876404494382,
                            "precision": 1.0,
                            "recall": 0.898876404494382,
                            "f1": 0.9467455621301775,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.10112359550561797,
                            "true_positive_rate": 0.898876404494382
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 59,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.6629213483146067,
                            "precision": 1.0,
                            "recall": 0.6629213483146067,
                            "f1": 0.7972972972972973,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.33707865168539325,
                            "true_positive_rate": 0.6629213483146067
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7949438202247191,
                                "stdev": 0.14614842179020857
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7949438202247191,
                                "stdev": 0.14614842179020857
                            },
                            "f1": {
                                "average": 0.8783752852180144,
                                "stdev": 0.09072251622018328
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2050561797752809,
                                "stdev": 0.1461484217902086
                            },
                            "true_positive_rate": {
                                "average": 0.7949438202247191,
                                "stdev": 0.14614842179020857
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 88,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.9887640449438202,
                            "precision": 1.0,
                            "recall": 0.9887640449438202,
                            "f1": 0.9943502824858758,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.011235955056179775,
                            "true_positive_rate": 0.9887640449438202
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 77,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.8651685393258427,
                            "precision": 1.0,
                            "recall": 0.8651685393258427,
                            "f1": 0.927710843373494,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1348314606741573,
                            "true_positive_rate": 0.8651685393258427
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 85,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.9550561797752809,
                            "precision": 1.0,
                            "recall": 0.9550561797752809,
                            "f1": 0.9770114942528736,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0449438202247191,
                            "true_positive_rate": 0.9550561797752809
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 56,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.6292134831460674,
                            "precision": 1.0,
                            "recall": 0.6292134831460674,
                            "f1": 0.7724137931034483,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3707865168539326,
                            "true_positive_rate": 0.6292134831460674
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8595505617977528,
                                "stdev": 0.1404494382022472
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8595505617977528,
                                "stdev": 0.1404494382022472
                            },
                            "f1": {
                                "average": 0.9178716033039229,
                                "stdev": 0.08746613668205228
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.1404494382022472,
                                "stdev": 0.1404494382022472
                            },
                            "true_positive_rate": {
                                "average": 0.8595505617977528,
                                "stdev": 0.1404494382022472
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 42,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.47191011235955055,
                            "precision": 1.0,
                            "recall": 0.47191011235955055,
                            "f1": 0.6412213740458015,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5280898876404494,
                            "true_positive_rate": 0.47191011235955055
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 43,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.48314606741573035,
                            "precision": 1.0,
                            "recall": 0.48314606741573035,
                            "f1": 0.6515151515151515,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5168539325842697,
                            "true_positive_rate": 0.48314606741573035
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 10,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.11235955056179775,
                            "precision": 1.0,
                            "recall": 0.11235955056179775,
                            "f1": 0.20202020202020202,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8876404494382022,
                            "true_positive_rate": 0.11235955056179775
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 4,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.0449438202247191,
                            "precision": 1.0,
                            "recall": 0.0449438202247191,
                            "f1": 0.08602150537634409,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9550561797752809,
                            "true_positive_rate": 0.0449438202247191
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2780898876404494,
                                "stdev": 0.20089670838133397
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2780898876404494,
                                "stdev": 0.20089670838133397
                            },
                            "f1": {
                                "average": 0.39519455823937477,
                                "stdev": 0.2545259069085276
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7219101123595506,
                                "stdev": 0.20089670838133394
                            },
                            "true_positive_rate": {
                                "average": 0.2780898876404494,
                                "stdev": 0.20089670838133397
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 87,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.9775280898876404,
                            "precision": 1.0,
                            "recall": 0.9775280898876404,
                            "f1": 0.9886363636363636,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.02247191011235955,
                            "true_positive_rate": 0.9775280898876404
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 86,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.9662921348314607,
                            "precision": 1.0,
                            "recall": 0.9662921348314607,
                            "f1": 0.9828571428571429,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.033707865168539325,
                            "true_positive_rate": 0.9662921348314607
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 43,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.48314606741573035,
                            "precision": 1.0,
                            "recall": 0.48314606741573035,
                            "f1": 0.6515151515151515,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5168539325842697,
                            "true_positive_rate": 0.48314606741573035
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 12,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.1348314606741573,
                            "precision": 1.0,
                            "recall": 0.1348314606741573,
                            "f1": 0.2376237623762376,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8651685393258427,
                            "true_positive_rate": 0.1348314606741573
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6404494382022472,
                                "stdev": 0.3536203366709262
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6404494382022472,
                                "stdev": 0.3536203366709262
                            },
                            "f1": {
                                "average": 0.7151581050962239,
                                "stdev": 0.3076290813340805
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3595505617977528,
                                "stdev": 0.3536203366709262
                            },
                            "true_positive_rate": {
                                "average": 0.6404494382022472,
                                "stdev": 0.3536203366709262
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 26,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.29213483146067415,
                            "precision": 1.0,
                            "recall": 0.29213483146067415,
                            "f1": 0.45217391304347826,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7078651685393258,
                            "true_positive_rate": 0.29213483146067415
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 9,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.10112359550561797,
                            "precision": 1.0,
                            "recall": 0.10112359550561797,
                            "f1": 0.1836734693877551,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.898876404494382,
                            "true_positive_rate": 0.10112359550561797
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 6,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.06741573033707865,
                            "precision": 1.0,
                            "recall": 0.06741573033707865,
                            "f1": 0.12631578947368421,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9325842696629213,
                            "true_positive_rate": 0.06741573033707865
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 0,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.11516853932584269,
                                "stdev": 0.10846480099049338
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.11516853932584269,
                                "stdev": 0.10846480099049338
                            },
                            "f1": {
                                "average": 0.1905407929762294,
                                "stdev": 0.16502249430933993
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8848314606741573,
                                "stdev": 0.1084648009904934
                            },
                            "true_positive_rate": {
                                "average": 0.11516853932584269,
                                "stdev": 0.10846480099049338
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 53,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.5955056179775281,
                            "precision": 1.0,
                            "recall": 0.5955056179775281,
                            "f1": 0.7464788732394366,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4044943820224719,
                            "true_positive_rate": 0.5955056179775281
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 15,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.16853932584269662,
                            "precision": 1.0,
                            "recall": 0.16853932584269662,
                            "f1": 0.28846153846153844,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8314606741573034,
                            "true_positive_rate": 0.16853932584269662
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 14,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.15730337078651685,
                            "precision": 1.0,
                            "recall": 0.15730337078651685,
                            "f1": 0.27184466019417475,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8426966292134831,
                            "true_positive_rate": 0.15730337078651685
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 0,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2303370786516854,
                                "stdev": 0.22110863656353288
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.2303370786516854,
                                "stdev": 0.22110863656353288
                            },
                            "f1": {
                                "average": 0.3266962679737875,
                                "stdev": 0.2680571210621441
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7696629213483146,
                                "stdev": 0.22110863656353288
                            },
                            "true_positive_rate": {
                                "average": 0.2303370786516854,
                                "stdev": 0.22110863656353288
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 52,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.5842696629213483,
                            "precision": 1.0,
                            "recall": 0.5842696629213483,
                            "f1": 0.7375886524822695,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4157303370786517,
                            "true_positive_rate": 0.5842696629213483
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 9,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.10112359550561797,
                            "precision": 1.0,
                            "recall": 0.10112359550561797,
                            "f1": 0.1836734693877551,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.898876404494382,
                            "true_positive_rate": 0.10112359550561797
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 46,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.5168539325842697,
                            "precision": 1.0,
                            "recall": 0.5168539325842697,
                            "f1": 0.6814814814814815,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.48314606741573035,
                            "true_positive_rate": 0.5168539325842697
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 8,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.0898876404494382,
                            "precision": 1.0,
                            "recall": 0.0898876404494382,
                            "f1": 0.16494845360824742,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9101123595505618,
                            "true_positive_rate": 0.0898876404494382
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.32303370786516855,
                                "stdev": 0.22880761047391204
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.32303370786516855,
                                "stdev": 0.22880761047391204
                            },
                            "f1": {
                                "average": 0.44192301423993835,
                                "stdev": 0.26842790629787383
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6769662921348314,
                                "stdev": 0.22880761047391204
                            },
                            "true_positive_rate": {
                                "average": 0.32303370786516855,
                                "stdev": 0.22880761047391204
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 43,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.48314606741573035,
                            "precision": 1.0,
                            "recall": 0.48314606741573035,
                            "f1": 0.6515151515151515,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5168539325842697,
                            "true_positive_rate": 0.48314606741573035
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 15,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.16853932584269662,
                            "precision": 1.0,
                            "recall": 0.16853932584269662,
                            "f1": 0.28846153846153844,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8314606741573034,
                            "true_positive_rate": 0.16853932584269662
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 58,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.651685393258427,
                            "precision": 1.0,
                            "recall": 0.651685393258427,
                            "f1": 0.7891156462585034,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.34831460674157305,
                            "true_positive_rate": 0.651685393258427
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 8,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.0898876404494382,
                            "precision": 1.0,
                            "recall": 0.0898876404494382,
                            "f1": 0.16494845360824742,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9101123595505618,
                            "true_positive_rate": 0.0898876404494382
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.34831460674157305,
                                "stdev": 0.2287558772078914
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.34831460674157305,
                                "stdev": 0.2287558772078914
                            },
                            "f1": {
                                "average": 0.4735101974608602,
                                "stdev": 0.2553164306039884
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.651685393258427,
                                "stdev": 0.22875587720789137
                            },
                            "true_positive_rate": {
                                "average": 0.34831460674157305,
                                "stdev": 0.2287558772078914
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 23,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.25842696629213485,
                            "precision": 1.0,
                            "recall": 0.25842696629213485,
                            "f1": 0.4107142857142857,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7415730337078652,
                            "true_positive_rate": 0.25842696629213485
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 27,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.30337078651685395,
                            "precision": 1.0,
                            "recall": 0.30337078651685395,
                            "f1": 0.46551724137931033,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6966292134831461,
                            "true_positive_rate": 0.30337078651685395
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 20,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.2247191011235955,
                            "precision": 1.0,
                            "recall": 0.2247191011235955,
                            "f1": 0.3669724770642202,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7752808988764045,
                            "true_positive_rate": 0.2247191011235955
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 10,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.11235955056179775,
                            "precision": 1.0,
                            "recall": 0.11235955056179775,
                            "f1": 0.20202020202020202,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8876404494382022,
                            "true_positive_rate": 0.11235955056179775
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2247191011235955,
                                "stdev": 0.07061688252795807
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2247191011235955,
                                "stdev": 0.07061688252795807
                            },
                            "f1": {
                                "average": 0.36130605154450457,
                                "stdev": 0.09836823156818858
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7752808988764044,
                                "stdev": 0.07061688252795803
                            },
                            "true_positive_rate": {
                                "average": 0.2247191011235955,
                                "stdev": 0.07061688252795807
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 89,
                        "prediction_error_num": 54,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.6067415730337079,
                            "precision": 1.0,
                            "recall": 0.6067415730337079,
                            "f1": 0.7552447552447552,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.39325842696629215,
                            "true_positive_rate": 0.6067415730337079
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 89,
                        "prediction_error_num": 60,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.6741573033707865,
                            "precision": 1.0,
                            "recall": 0.6741573033707865,
                            "f1": 0.8053691275167785,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3258426966292135,
                            "true_positive_rate": 0.6741573033707865
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 89,
                        "prediction_error_num": 49,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.550561797752809,
                            "precision": 1.0,
                            "recall": 0.550561797752809,
                            "f1": 0.7101449275362319,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.449438202247191,
                            "true_positive_rate": 0.550561797752809
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 89,
                        "prediction_error_num": 35,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.39325842696629215,
                            "precision": 1.0,
                            "recall": 0.39325842696629215,
                            "f1": 0.5645161290322581,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6067415730337079,
                            "true_positive_rate": 0.39325842696629215
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5561797752808989,
                                "stdev": 0.10374261411583925
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5561797752808989,
                                "stdev": 0.10374261411583925
                            },
                            "f1": {
                                "average": 0.7088187348325059,
                                "stdev": 0.08986427895253339
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4438202247191011,
                                "stdev": 0.10374261411583927
                            },
                            "true_positive_rate": {
                                "average": 0.5561797752808989,
                                "stdev": 0.10374261411583925
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.4578651685393258,
                            "stdev": 0.24257163485749503
                        },
                        "precision": {
                            "average": 0.9545454545454546,
                            "stdev": 0.09642365197998375
                        },
                        "recall": {
                            "average": 0.4578651685393258,
                            "stdev": 0.24257163485749503
                        },
                        "f1": {
                            "average": 0.5624928145036485,
                            "stdev": 0.23396126520380958
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5421348314606742,
                            "stdev": 0.242571634857495
                        },
                        "true_positive_rate": {
                            "average": 0.4578651685393258,
                            "stdev": 0.24257163485749503
                        }
                    }
                }
            }
        },
        "Instruction-Following": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 7,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.875,
                            "precision": 1.0,
                            "recall": 0.875,
                            "f1": 0.9333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.125,
                            "true_positive_rate": 0.875
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 5,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.625,
                            "precision": 1.0,
                            "recall": 0.625,
                            "f1": 0.7692307692307693,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.375,
                            "true_positive_rate": 0.625
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 4,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.5,
                            "precision": 1.0,
                            "recall": 0.5,
                            "f1": 0.6666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5,
                            "true_positive_rate": 0.5
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.53125,
                                "stdev": 0.2706329386826371
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.53125,
                                "stdev": 0.2706329386826371
                            },
                            "f1": {
                                "average": 0.6478632478632479,
                                "stdev": 0.26350858439936764
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.46875,
                                "stdev": 0.2706329386826371
                            },
                            "true_positive_rate": {
                                "average": 0.53125,
                                "stdev": 0.2706329386826371
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 5,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.625,
                            "precision": 1.0,
                            "recall": 0.625,
                            "f1": 0.7692307692307693,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.375,
                            "true_positive_rate": 0.625
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 0,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.65625,
                                "stdev": 0.40864677595693816
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.65625,
                                "stdev": 0.40864677595693816
                            },
                            "f1": {
                                "average": 0.6923076923076923,
                                "stdev": 0.4106568558473581
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.34375,
                                "stdev": 0.40864677595693816
                            },
                            "true_positive_rate": {
                                "average": 0.65625,
                                "stdev": 0.40864677595693816
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 7,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.875,
                            "precision": 1.0,
                            "recall": 0.875,
                            "f1": 0.9333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.125,
                            "true_positive_rate": 0.875
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.75,
                                "stdev": 0.3644344934278313
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.75,
                                "stdev": 0.3644344934278313
                            },
                            "f1": {
                                "average": 0.788888888888889,
                                "stdev": 0.32829526005987014
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.25,
                                "stdev": 0.3644344934278313
                            },
                            "true_positive_rate": {
                                "average": 0.75,
                                "stdev": 0.3644344934278313
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 7,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.875,
                            "precision": 1.0,
                            "recall": 0.875,
                            "f1": 0.9333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.125,
                            "true_positive_rate": 0.875
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 5,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.625,
                            "precision": 1.0,
                            "recall": 0.625,
                            "f1": 0.7692307692307693,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.375,
                            "true_positive_rate": 0.625
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 4,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.5,
                            "precision": 1.0,
                            "recall": 0.5,
                            "f1": 0.6666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5,
                            "true_positive_rate": 0.5
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.53125,
                                "stdev": 0.2706329386826371
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.53125,
                                "stdev": 0.2706329386826371
                            },
                            "f1": {
                                "average": 0.6478632478632479,
                                "stdev": 0.26350858439936764
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.46875,
                                "stdev": 0.2706329386826371
                            },
                            "true_positive_rate": {
                                "average": 0.53125,
                                "stdev": 0.2706329386826371
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 6,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.75,
                            "precision": 1.0,
                            "recall": 0.75,
                            "f1": 0.8571428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25,
                            "true_positive_rate": 0.75
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 3,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.375,
                            "precision": 1.0,
                            "recall": 0.375,
                            "f1": 0.5454545454545454,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.625,
                            "true_positive_rate": 0.375
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 0,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.53125,
                                "stdev": 0.3788861141556919
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.53125,
                                "stdev": 0.3788861141556919
                            },
                            "f1": {
                                "average": 0.6006493506493507,
                                "stdev": 0.3837629375558884
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.46875,
                                "stdev": 0.3788861141556919
                            },
                            "true_positive_rate": {
                                "average": 0.53125,
                                "stdev": 0.3788861141556919
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.15625,
                                "stdev": 0.05412658773652741
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.15625,
                                "stdev": 0.05412658773652741
                            },
                            "f1": {
                                "average": 0.26666666666666666,
                                "stdev": 0.07698003589195011
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.84375,
                                "stdev": 0.05412658773652741
                            },
                            "true_positive_rate": {
                                "average": 0.15625,
                                "stdev": 0.05412658773652741
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 3,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.375,
                            "precision": 1.0,
                            "recall": 0.375,
                            "f1": 0.5454545454545454,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.625,
                            "true_positive_rate": 0.375
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 0,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1875,
                                "stdev": 0.13975424859373686
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.1875,
                                "stdev": 0.13975424859373686
                            },
                            "f1": {
                                "average": 0.2919191919191919,
                                "stdev": 0.2037376491737545
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8125,
                                "stdev": 0.13975424859373686
                            },
                            "true_positive_rate": {
                                "average": 0.1875,
                                "stdev": 0.13975424859373686
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 7,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.875,
                            "precision": 1.0,
                            "recall": 0.875,
                            "f1": 0.9333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.125,
                            "true_positive_rate": 0.875
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 3,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.375,
                            "precision": 1.0,
                            "recall": 0.375,
                            "f1": 0.5454545454545454,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.625,
                            "true_positive_rate": 0.375
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 5,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.625,
                            "precision": 1.0,
                            "recall": 0.625,
                            "f1": 0.7692307692307693,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.375,
                            "true_positive_rate": 0.625
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5,
                                "stdev": 0.2795084971874737
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5,
                                "stdev": 0.2795084971874737
                            },
                            "f1": {
                                "average": 0.6175602175602176,
                                "stdev": 0.2665557987536506
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5,
                                "stdev": 0.2795084971874737
                            },
                            "true_positive_rate": {
                                "average": 0.5,
                                "stdev": 0.2795084971874737
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 3,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.375,
                            "precision": 1.0,
                            "recall": 0.375,
                            "f1": 0.5454545454545454,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.625,
                            "true_positive_rate": 0.375
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 3,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.375,
                            "precision": 1.0,
                            "recall": 0.375,
                            "f1": 0.5454545454545454,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.625,
                            "true_positive_rate": 0.375
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 4,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.5,
                            "precision": 1.0,
                            "recall": 0.5,
                            "f1": 0.6666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5,
                            "true_positive_rate": 0.5
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.34375,
                                "stdev": 0.13621559198564606
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.34375,
                                "stdev": 0.13621559198564606
                            },
                            "f1": {
                                "average": 0.4949494949494949,
                                "stdev": 0.1650518650340221
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.65625,
                                "stdev": 0.13621559198564606
                            },
                            "true_positive_rate": {
                                "average": 0.34375,
                                "stdev": 0.13621559198564606
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 0,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 0,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 0,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.03125,
                                "stdev": 0.05412658773652741
                            },
                            "precision": {
                                "average": 0.25,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.03125,
                                "stdev": 0.05412658773652741
                            },
                            "f1": {
                                "average": 0.05555555555555555,
                                "stdev": 0.09622504486493763
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.96875,
                                "stdev": 0.05412658773652741
                            },
                            "true_positive_rate": {
                                "average": 0.03125,
                                "stdev": 0.05412658773652741
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1875,
                                "stdev": 0.0625
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1875,
                                "stdev": 0.0625
                            },
                            "f1": {
                                "average": 0.3111111111111111,
                                "stdev": 0.0888888888888889
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8125,
                                "stdev": 0.0625
                            },
                            "true_positive_rate": {
                                "average": 0.1875,
                                "stdev": 0.0625
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.4005681818181818,
                            "stdev": 0.22129934197898235
                        },
                        "precision": {
                            "average": 0.8636363636363636,
                            "stdev": 0.22268088570756164
                        },
                        "recall": {
                            "average": 0.4005681818181818,
                            "stdev": 0.22129934197898235
                        },
                        "f1": {
                            "average": 0.4923031513940605,
                            "stdev": 0.2170970897878384
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5994318181818182,
                            "stdev": 0.22129934197898235
                        },
                        "true_positive_rate": {
                            "average": 0.4005681818181818,
                            "stdev": 0.22129934197898235
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 56,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.7887323943661971,
                            "precision": 1.0,
                            "recall": 0.7887323943661971,
                            "f1": 0.8818897637795275,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2112676056338028,
                            "true_positive_rate": 0.7887323943661971
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 43,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6056338028169014,
                            "precision": 1.0,
                            "recall": 0.6056338028169014,
                            "f1": 0.7543859649122807,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.39436619718309857,
                            "true_positive_rate": 0.6056338028169014
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 57,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.8028169014084507,
                            "precision": 1.0,
                            "recall": 0.8028169014084507,
                            "f1": 0.890625,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.19718309859154928,
                            "true_positive_rate": 0.8028169014084507
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 23,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.323943661971831,
                            "precision": 1.0,
                            "recall": 0.323943661971831,
                            "f1": 0.48936170212765956,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.676056338028169,
                            "true_positive_rate": 0.323943661971831
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.630281690140845,
                                "stdev": 0.19321330870871758
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.630281690140845,
                                "stdev": 0.19321330870871758
                            },
                            "f1": {
                                "average": 0.754065607704867,
                                "stdev": 0.16206151822439427
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3697183098591549,
                                "stdev": 0.19321330870871758
                            },
                            "true_positive_rate": {
                                "average": 0.630281690140845,
                                "stdev": 0.19321330870871758
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 68,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.9577464788732394,
                            "precision": 1.0,
                            "recall": 0.9577464788732394,
                            "f1": 0.9784172661870504,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.04225352112676056,
                            "true_positive_rate": 0.9577464788732394
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 45,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6338028169014085,
                            "precision": 1.0,
                            "recall": 0.6338028169014085,
                            "f1": 0.7758620689655172,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.36619718309859156,
                            "true_positive_rate": 0.6338028169014085
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 62,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.8732394366197183,
                            "precision": 1.0,
                            "recall": 0.8732394366197183,
                            "f1": 0.9323308270676691,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1267605633802817,
                            "true_positive_rate": 0.8732394366197183
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 44,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6197183098591549,
                            "precision": 1.0,
                            "recall": 0.6197183098591549,
                            "f1": 0.7652173913043478,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38028169014084506,
                            "true_positive_rate": 0.6197183098591549
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7711267605633803,
                                "stdev": 0.14750957792713668
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7711267605633803,
                                "stdev": 0.14750957792713668
                            },
                            "f1": {
                                "average": 0.8629568883811461,
                                "stdev": 0.0939179949075656
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2288732394366197,
                                "stdev": 0.1475095779271367
                            },
                            "true_positive_rate": {
                                "average": 0.7711267605633803,
                                "stdev": 0.14750957792713668
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 70,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.9859154929577465,
                            "precision": 1.0,
                            "recall": 0.9859154929577465,
                            "f1": 0.9929078014184397,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.014084507042253521,
                            "true_positive_rate": 0.9859154929577465
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 57,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.8028169014084507,
                            "precision": 1.0,
                            "recall": 0.8028169014084507,
                            "f1": 0.890625,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.19718309859154928,
                            "true_positive_rate": 0.8028169014084507
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 66,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.9295774647887324,
                            "precision": 1.0,
                            "recall": 0.9295774647887324,
                            "f1": 0.9635036496350365,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.07042253521126761,
                            "true_positive_rate": 0.9295774647887324
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 43,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6056338028169014,
                            "precision": 1.0,
                            "recall": 0.6056338028169014,
                            "f1": 0.7543859649122807,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.39436619718309857,
                            "true_positive_rate": 0.6056338028169014
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8309859154929577,
                                "stdev": 0.14603127713611072
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8309859154929577,
                                "stdev": 0.14603127713611072
                            },
                            "f1": {
                                "average": 0.9003556039914393,
                                "stdev": 0.09213496228917438
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.16901408450704225,
                                "stdev": 0.1460312771361107
                            },
                            "true_positive_rate": {
                                "average": 0.8309859154929577,
                                "stdev": 0.14603127713611072
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 32,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.4507042253521127,
                            "precision": 1.0,
                            "recall": 0.4507042253521127,
                            "f1": 0.6213592233009708,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5492957746478874,
                            "true_positive_rate": 0.4507042253521127
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 35,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.49295774647887325,
                            "precision": 1.0,
                            "recall": 0.49295774647887325,
                            "f1": 0.660377358490566,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5070422535211268,
                            "true_positive_rate": 0.49295774647887325
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 7,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.09859154929577464,
                            "precision": 1.0,
                            "recall": 0.09859154929577464,
                            "f1": 0.1794871794871795,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9014084507042254,
                            "true_positive_rate": 0.09859154929577464
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 1,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.014084507042253521,
                            "precision": 1.0,
                            "recall": 0.014084507042253521,
                            "f1": 0.027777777777777776,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9859154929577465,
                            "true_positive_rate": 0.014084507042253521
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.26408450704225356,
                                "stdev": 0.2104149460327981
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.26408450704225356,
                                "stdev": 0.2104149460327981
                            },
                            "f1": {
                                "average": 0.3722503847641235,
                                "stdev": 0.2742678420351837
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7359154929577465,
                                "stdev": 0.21041494603279806
                            },
                            "true_positive_rate": {
                                "average": 0.26408450704225356,
                                "stdev": 0.2104149460327981
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 67,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.9436619718309859,
                            "precision": 1.0,
                            "recall": 0.9436619718309859,
                            "f1": 0.9710144927536232,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.056338028169014086,
                            "true_positive_rate": 0.9436619718309859
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 68,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.9577464788732394,
                            "precision": 1.0,
                            "recall": 0.9577464788732394,
                            "f1": 0.9784172661870504,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.04225352112676056,
                            "true_positive_rate": 0.9577464788732394
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 32,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.4507042253521127,
                            "precision": 1.0,
                            "recall": 0.4507042253521127,
                            "f1": 0.6213592233009708,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5492957746478874,
                            "true_positive_rate": 0.4507042253521127
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 13,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.18309859154929578,
                            "precision": 1.0,
                            "recall": 0.18309859154929578,
                            "f1": 0.30952380952380953,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8169014084507042,
                            "true_positive_rate": 0.18309859154929578
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6338028169014085,
                                "stdev": 0.3307610863610529
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6338028169014085,
                                "stdev": 0.3307610863610529
                            },
                            "f1": {
                                "average": 0.7200786979413634,
                                "stdev": 0.2774925404574496
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.36619718309859156,
                                "stdev": 0.33076108636105295
                            },
                            "true_positive_rate": {
                                "average": 0.6338028169014085,
                                "stdev": 0.3307610863610529
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 25,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.352112676056338,
                            "precision": 1.0,
                            "recall": 0.352112676056338,
                            "f1": 0.5208333333333334,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.647887323943662,
                            "true_positive_rate": 0.352112676056338
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 10,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.14084507042253522,
                            "precision": 1.0,
                            "recall": 0.14084507042253522,
                            "f1": 0.24691358024691357,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8591549295774648,
                            "true_positive_rate": 0.14084507042253522
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 7,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.09859154929577464,
                            "precision": 1.0,
                            "recall": 0.09859154929577464,
                            "f1": 0.1794871794871795,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9014084507042254,
                            "true_positive_rate": 0.09859154929577464
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 0,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.14788732394366197,
                                "stdev": 0.12850906754151167
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.14788732394366197,
                                "stdev": 0.12850906754151167
                            },
                            "f1": {
                                "average": 0.2368085232668566,
                                "stdev": 0.18717386172647457
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.852112676056338,
                                "stdev": 0.12850906754151167
                            },
                            "true_positive_rate": {
                                "average": 0.14788732394366197,
                                "stdev": 0.12850906754151167
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 47,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6619718309859155,
                            "precision": 1.0,
                            "recall": 0.6619718309859155,
                            "f1": 0.7966101694915254,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3380281690140845,
                            "true_positive_rate": 0.6619718309859155
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 21,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.29577464788732394,
                            "precision": 1.0,
                            "recall": 0.29577464788732394,
                            "f1": 0.45652173913043476,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.704225352112676,
                            "true_positive_rate": 0.29577464788732394
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 13,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.18309859154929578,
                            "precision": 1.0,
                            "recall": 0.18309859154929578,
                            "f1": 0.30952380952380953,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8169014084507042,
                            "true_positive_rate": 0.18309859154929578
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 1,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.014084507042253521,
                            "precision": 1.0,
                            "recall": 0.014084507042253521,
                            "f1": 0.027777777777777776,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9859154929577465,
                            "true_positive_rate": 0.014084507042253521
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2887323943661972,
                                "stdev": 0.23766953564346768
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2887323943661972,
                                "stdev": 0.23766953564346768
                            },
                            "f1": {
                                "average": 0.39760837398088683,
                                "stdev": 0.2771311863529155
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7112676056338028,
                                "stdev": 0.23766953564346768
                            },
                            "true_positive_rate": {
                                "average": 0.2887323943661972,
                                "stdev": 0.23766953564346768
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 42,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.5915492957746479,
                            "precision": 1.0,
                            "recall": 0.5915492957746479,
                            "f1": 0.7433628318584071,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4084507042253521,
                            "true_positive_rate": 0.5915492957746479
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 10,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.14084507042253522,
                            "precision": 1.0,
                            "recall": 0.14084507042253522,
                            "f1": 0.24691358024691357,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8591549295774648,
                            "true_positive_rate": 0.14084507042253522
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 37,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.5211267605633803,
                            "precision": 1.0,
                            "recall": 0.5211267605633803,
                            "f1": 0.6851851851851852,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4788732394366197,
                            "true_positive_rate": 0.5211267605633803
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 5,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.07042253521126761,
                            "precision": 1.0,
                            "recall": 0.07042253521126761,
                            "f1": 0.13157894736842105,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9295774647887324,
                            "true_positive_rate": 0.07042253521126761
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.33098591549295775,
                                "stdev": 0.22808640479861472
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.33098591549295775,
                                "stdev": 0.22808640479861472
                            },
                            "f1": {
                                "average": 0.4517601361647317,
                                "stdev": 0.2664570747683526
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6690140845070423,
                                "stdev": 0.22808640479861472
                            },
                            "true_positive_rate": {
                                "average": 0.33098591549295775,
                                "stdev": 0.22808640479861472
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 34,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.4788732394366197,
                            "precision": 1.0,
                            "recall": 0.4788732394366197,
                            "f1": 0.6476190476190476,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5211267605633803,
                            "true_positive_rate": 0.4788732394366197
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 13,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.18309859154929578,
                            "precision": 1.0,
                            "recall": 0.18309859154929578,
                            "f1": 0.30952380952380953,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8169014084507042,
                            "true_positive_rate": 0.18309859154929578
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 44,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6197183098591549,
                            "precision": 1.0,
                            "recall": 0.6197183098591549,
                            "f1": 0.7652173913043478,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38028169014084506,
                            "true_positive_rate": 0.6197183098591549
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 7,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.09859154929577464,
                            "precision": 1.0,
                            "recall": 0.09859154929577464,
                            "f1": 0.1794871794871795,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9014084507042254,
                            "true_positive_rate": 0.09859154929577464
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3450704225352113,
                                "stdev": 0.21232131593917372
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3450704225352113,
                                "stdev": 0.21232131593917372
                            },
                            "f1": {
                                "average": 0.47546185698359617,
                                "stdev": 0.23913009705538413
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6549295774647887,
                                "stdev": 0.21232131593917375
                            },
                            "true_positive_rate": {
                                "average": 0.3450704225352113,
                                "stdev": 0.21232131593917372
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 26,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.36619718309859156,
                            "precision": 1.0,
                            "recall": 0.36619718309859156,
                            "f1": 0.5360824742268041,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6338028169014085,
                            "true_positive_rate": 0.36619718309859156
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 27,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.38028169014084506,
                            "precision": 1.0,
                            "recall": 0.38028169014084506,
                            "f1": 0.5510204081632653,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6197183098591549,
                            "true_positive_rate": 0.38028169014084506
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 21,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.29577464788732394,
                            "precision": 1.0,
                            "recall": 0.29577464788732394,
                            "f1": 0.45652173913043476,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.704225352112676,
                            "true_positive_rate": 0.29577464788732394
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 10,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.14084507042253522,
                            "precision": 1.0,
                            "recall": 0.14084507042253522,
                            "f1": 0.24691358024691357,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8591549295774648,
                            "true_positive_rate": 0.14084507042253522
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.29577464788732394,
                                "stdev": 0.09500519410726789
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.29577464788732394,
                                "stdev": 0.09500519410726789
                            },
                            "f1": {
                                "average": 0.4476345504418544,
                                "stdev": 0.12132555460149856
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.704225352112676,
                                "stdev": 0.09500519410726789
                            },
                            "true_positive_rate": {
                                "average": 0.29577464788732394,
                                "stdev": 0.09500519410726789
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 71,
                        "prediction_error_num": 41,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.5774647887323944,
                            "precision": 1.0,
                            "recall": 0.5774647887323944,
                            "f1": 0.7321428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4225352112676056,
                            "true_positive_rate": 0.5774647887323944
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 71,
                        "prediction_error_num": 45,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6338028169014085,
                            "precision": 1.0,
                            "recall": 0.6338028169014085,
                            "f1": 0.7758620689655172,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.36619718309859156,
                            "true_positive_rate": 0.6338028169014085
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 71,
                        "prediction_error_num": 36,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.5070422535211268,
                            "precision": 1.0,
                            "recall": 0.5070422535211268,
                            "f1": 0.6728971962616822,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.49295774647887325,
                            "true_positive_rate": 0.5070422535211268
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 71,
                        "prediction_error_num": 29,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.4084507042253521,
                            "precision": 1.0,
                            "recall": 0.4084507042253521,
                            "f1": 0.58,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5915492957746479,
                            "true_positive_rate": 0.4084507042253521
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5316901408450705,
                                "stdev": 0.08413945877006002
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5316901408450705,
                                "stdev": 0.08413945877006002
                            },
                            "f1": {
                                "average": 0.6902255305925141,
                                "stdev": 0.07338361147886033
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4683098591549296,
                                "stdev": 0.08413945877006
                            },
                            "true_positive_rate": {
                                "average": 0.5316901408450705,
                                "stdev": 0.08413945877006002
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.46094750320102434,
                            "stdev": 0.21757266047152396
                        },
                        "precision": {
                            "average": 0.9772727272727273,
                            "stdev": 0.07186994682200862
                        },
                        "recall": {
                            "average": 0.46094750320102434,
                            "stdev": 0.21757266047152396
                        },
                        "f1": {
                            "average": 0.57356419583758,
                            "stdev": 0.20963356323790966
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5390524967989757,
                            "stdev": 0.21757266047152396
                        },
                        "true_positive_rate": {
                            "average": 0.46094750320102434,
                            "stdev": 0.21757266047152396
                        }
                    }
                }
            }
        },
        "Context-Faithfulness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 39,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.75,
                            "precision": 1.0,
                            "recall": 0.75,
                            "f1": 0.8571428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25,
                            "true_positive_rate": 0.75
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 32,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.6153846153846154,
                            "precision": 1.0,
                            "recall": 0.6153846153846154,
                            "f1": 0.7619047619047619,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38461538461538464,
                            "true_positive_rate": 0.6153846153846154
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 32,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.6153846153846154,
                            "precision": 1.0,
                            "recall": 0.6153846153846154,
                            "f1": 0.7619047619047619,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38461538461538464,
                            "true_positive_rate": 0.6153846153846154
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 20,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.38461538461538464,
                            "precision": 1.0,
                            "recall": 0.38461538461538464,
                            "f1": 0.5555555555555556,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6153846153846154,
                            "true_positive_rate": 0.38461538461538464
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5913461538461539,
                                "stdev": 0.13140048431458123
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5913461538461539,
                                "stdev": 0.13140048431458123
                            },
                            "f1": {
                                "average": 0.7341269841269842,
                                "stdev": 0.110186058546943
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.40865384615384615,
                                "stdev": 0.13140048431458123
                            },
                            "true_positive_rate": {
                                "average": 0.5913461538461539,
                                "stdev": 0.13140048431458123
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 50,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.9615384615384616,
                            "precision": 1.0,
                            "recall": 0.9615384615384616,
                            "f1": 0.9803921568627451,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.038461538461538464,
                            "true_positive_rate": 0.9615384615384616
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 28,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.5384615384615384,
                            "precision": 1.0,
                            "recall": 0.5384615384615384,
                            "f1": 0.7,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.46153846153846156,
                            "true_positive_rate": 0.5384615384615384
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 47,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.9038461538461539,
                            "precision": 1.0,
                            "recall": 0.9038461538461539,
                            "f1": 0.9494949494949495,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.09615384615384616,
                            "true_positive_rate": 0.9038461538461539
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 14,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.2692307692307692,
                            "precision": 1.0,
                            "recall": 0.2692307692307692,
                            "f1": 0.42424242424242425,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7307692307692307,
                            "true_positive_rate": 0.2692307692307692
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6682692307692307,
                                "stdev": 0.2817734281171655
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6682692307692307,
                                "stdev": 0.2817734281171655
                            },
                            "f1": {
                                "average": 0.7635323826500297,
                                "stdev": 0.2240336344505918
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3317307692307692,
                                "stdev": 0.28177342811716544
                            },
                            "true_positive_rate": {
                                "average": 0.6682692307692307,
                                "stdev": 0.2817734281171655
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 46,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.8846153846153846,
                            "precision": 1.0,
                            "recall": 0.8846153846153846,
                            "f1": 0.9387755102040817,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.11538461538461539,
                            "true_positive_rate": 0.8846153846153846
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 18,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.34615384615384615,
                            "precision": 1.0,
                            "recall": 0.34615384615384615,
                            "f1": 0.5142857142857142,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6538461538461539,
                            "true_positive_rate": 0.34615384615384615
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8076923076923077,
                                "stdev": 0.2706009092205825
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8076923076923077,
                                "stdev": 0.2706009092205825
                            },
                            "f1": {
                                "average": 0.863265306122449,
                                "stdev": 0.20302789183912576
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.19230769230769232,
                                "stdev": 0.2706009092205825
                            },
                            "true_positive_rate": {
                                "average": 0.8076923076923077,
                                "stdev": 0.2706009092205825
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 43,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.8269230769230769,
                            "precision": 1.0,
                            "recall": 0.8269230769230769,
                            "f1": 0.9052631578947369,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17307692307692307,
                            "true_positive_rate": 0.8269230769230769
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 34,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.6538461538461539,
                            "precision": 1.0,
                            "recall": 0.6538461538461539,
                            "f1": 0.7906976744186046,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.34615384615384615,
                            "true_positive_rate": 0.6538461538461539
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 7,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.1346153846153846,
                            "precision": 1.0,
                            "recall": 0.1346153846153846,
                            "f1": 0.23728813559322035,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8653846153846154,
                            "true_positive_rate": 0.1346153846153846
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 3,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.057692307692307696,
                            "precision": 1.0,
                            "recall": 0.057692307692307696,
                            "f1": 0.10909090909090909,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9423076923076923,
                            "true_positive_rate": 0.057692307692307696
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4182692307692308,
                                "stdev": 0.3290021559793389
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4182692307692308,
                                "stdev": 0.3290021559793389
                            },
                            "f1": {
                                "average": 0.5105849692493677,
                                "stdev": 0.3428274492432809
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5817307692307692,
                                "stdev": 0.3290021559793389
                            },
                            "true_positive_rate": {
                                "average": 0.4182692307692308,
                                "stdev": 0.3290021559793389
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 41,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.7884615384615384,
                            "precision": 1.0,
                            "recall": 0.7884615384615384,
                            "f1": 0.8817204301075269,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.21153846153846154,
                            "true_positive_rate": 0.7884615384615384
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 44,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.8461538461538461,
                            "precision": 1.0,
                            "recall": 0.8461538461538461,
                            "f1": 0.9166666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15384615384615385,
                            "true_positive_rate": 0.8461538461538461
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 24,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.46153846153846156,
                            "precision": 1.0,
                            "recall": 0.46153846153846156,
                            "f1": 0.631578947368421,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5384615384615384,
                            "true_positive_rate": 0.46153846153846156
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 11,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.21153846153846154,
                            "precision": 1.0,
                            "recall": 0.21153846153846154,
                            "f1": 0.3492063492063492,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7884615384615384,
                            "true_positive_rate": 0.21153846153846154
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.576923076923077,
                                "stdev": 0.2569305618439882
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.576923076923077,
                                "stdev": 0.2569305618439882
                            },
                            "f1": {
                                "average": 0.6947930983372409,
                                "stdev": 0.22781348607114854
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4230769230769231,
                                "stdev": 0.2569305618439882
                            },
                            "true_positive_rate": {
                                "average": 0.576923076923077,
                                "stdev": 0.2569305618439882
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 20,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.38461538461538464,
                            "precision": 1.0,
                            "recall": 0.38461538461538464,
                            "f1": 0.5555555555555556,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6153846153846154,
                            "true_positive_rate": 0.38461538461538464
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 10,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.19230769230769232,
                            "precision": 1.0,
                            "recall": 0.19230769230769232,
                            "f1": 0.3225806451612903,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8076923076923077,
                            "true_positive_rate": 0.19230769230769232
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 6,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.11538461538461539,
                            "precision": 1.0,
                            "recall": 0.11538461538461539,
                            "f1": 0.20689655172413793,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8846153846153846,
                            "true_positive_rate": 0.11538461538461539
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 2,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.038461538461538464,
                            "precision": 1.0,
                            "recall": 0.038461538461538464,
                            "f1": 0.07407407407407407,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9615384615384616,
                            "true_positive_rate": 0.038461538461538464
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1826923076923077,
                                "stdev": 0.1286450784640351
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1826923076923077,
                                "stdev": 0.1286450784640351
                            },
                            "f1": {
                                "average": 0.2897767066287644,
                                "stdev": 0.17685533684917062
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8173076923076923,
                                "stdev": 0.1286450784640351
                            },
                            "true_positive_rate": {
                                "average": 0.1826923076923077,
                                "stdev": 0.1286450784640351
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 32,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.6153846153846154,
                            "precision": 1.0,
                            "recall": 0.6153846153846154,
                            "f1": 0.7619047619047619,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38461538461538464,
                            "true_positive_rate": 0.6153846153846154
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 19,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.36538461538461536,
                            "precision": 1.0,
                            "recall": 0.36538461538461536,
                            "f1": 0.5352112676056338,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6346153846153846,
                            "true_positive_rate": 0.36538461538461536
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 18,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.34615384615384615,
                            "precision": 1.0,
                            "recall": 0.34615384615384615,
                            "f1": 0.5142857142857142,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6538461538461539,
                            "true_positive_rate": 0.34615384615384615
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 0,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3317307692307693,
                                "stdev": 0.21900080719096873
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.3317307692307693,
                                "stdev": 0.21900080719096873
                            },
                            "f1": {
                                "average": 0.45285043594902746,
                                "stdev": 0.2789021753889039
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6682692307692307,
                                "stdev": 0.2190008071909687
                            },
                            "true_positive_rate": {
                                "average": 0.3317307692307693,
                                "stdev": 0.21900080719096873
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 32,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.6153846153846154,
                            "precision": 1.0,
                            "recall": 0.6153846153846154,
                            "f1": 0.7619047619047619,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38461538461538464,
                            "true_positive_rate": 0.6153846153846154
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 14,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.2692307692307692,
                            "precision": 1.0,
                            "recall": 0.2692307692307692,
                            "f1": 0.42424242424242425,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7307692307692307,
                            "true_positive_rate": 0.2692307692307692
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 28,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.5384615384615384,
                            "precision": 1.0,
                            "recall": 0.5384615384615384,
                            "f1": 0.7,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.46153846153846156,
                            "true_positive_rate": 0.5384615384615384
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 8,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.15384615384615385,
                            "precision": 1.0,
                            "recall": 0.15384615384615385,
                            "f1": 0.26666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8461538461538461,
                            "true_positive_rate": 0.15384615384615385
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.39423076923076916,
                                "stdev": 0.18915688050871154
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.39423076923076916,
                                "stdev": 0.18915688050871154
                            },
                            "f1": {
                                "average": 0.5382034632034632,
                                "stdev": 0.20182897502250874
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6057692307692308,
                                "stdev": 0.18915688050871154
                            },
                            "true_positive_rate": {
                                "average": 0.39423076923076916,
                                "stdev": 0.18915688050871154
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 25,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.4807692307692308,
                            "precision": 1.0,
                            "recall": 0.4807692307692308,
                            "f1": 0.6493506493506493,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5192307692307693,
                            "true_positive_rate": 0.4807692307692308
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 11,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.21153846153846154,
                            "precision": 1.0,
                            "recall": 0.21153846153846154,
                            "f1": 0.3492063492063492,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7884615384615384,
                            "true_positive_rate": 0.21153846153846154
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 24,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.46153846153846156,
                            "precision": 1.0,
                            "recall": 0.46153846153846156,
                            "f1": 0.631578947368421,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5384615384615384,
                            "true_positive_rate": 0.46153846153846156
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 9,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.17307692307692307,
                            "precision": 1.0,
                            "recall": 0.17307692307692307,
                            "f1": 0.29508196721311475,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8269230769230769,
                            "true_positive_rate": 0.17307692307692307
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3317307692307692,
                                "stdev": 0.14024953986360406
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3317307692307692,
                                "stdev": 0.14024953986360406
                            },
                            "f1": {
                                "average": 0.4813044782846335,
                                "stdev": 0.16042963489029066
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6682692307692308,
                                "stdev": 0.14024953986360403
                            },
                            "true_positive_rate": {
                                "average": 0.3317307692307692,
                                "stdev": 0.14024953986360406
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 5,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.09615384615384616,
                            "precision": 1.0,
                            "recall": 0.09615384615384616,
                            "f1": 0.17543859649122806,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9038461538461539,
                            "true_positive_rate": 0.09615384615384616
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 5,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.09615384615384616,
                            "precision": 1.0,
                            "recall": 0.09615384615384616,
                            "f1": 0.17543859649122806,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9038461538461539,
                            "true_positive_rate": 0.09615384615384616
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 3,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.057692307692307696,
                            "precision": 1.0,
                            "recall": 0.057692307692307696,
                            "f1": 0.10909090909090909,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9423076923076923,
                            "true_positive_rate": 0.057692307692307696
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 3,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.057692307692307696,
                            "precision": 1.0,
                            "recall": 0.057692307692307696,
                            "f1": 0.10909090909090909,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9423076923076923,
                            "true_positive_rate": 0.057692307692307696
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.07692307692307693,
                                "stdev": 0.019230769230769232
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.07692307692307693,
                                "stdev": 0.019230769230769232
                            },
                            "f1": {
                                "average": 0.14226475279106857,
                                "stdev": 0.03317384370015949
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9230769230769231,
                                "stdev": 0.019230769230769218
                            },
                            "true_positive_rate": {
                                "average": 0.07692307692307693,
                                "stdev": 0.019230769230769232
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 52,
                        "prediction_error_num": 8,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.15384615384615385,
                            "precision": 1.0,
                            "recall": 0.15384615384615385,
                            "f1": 0.26666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8461538461538461,
                            "true_positive_rate": 0.15384615384615385
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 52,
                        "prediction_error_num": 6,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.11538461538461539,
                            "precision": 1.0,
                            "recall": 0.11538461538461539,
                            "f1": 0.20689655172413793,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8846153846153846,
                            "true_positive_rate": 0.11538461538461539
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 52,
                        "prediction_error_num": 7,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.1346153846153846,
                            "precision": 1.0,
                            "recall": 0.1346153846153846,
                            "f1": 0.23728813559322035,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8653846153846154,
                            "true_positive_rate": 0.1346153846153846
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 52,
                        "prediction_error_num": 3,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.057692307692307696,
                            "precision": 1.0,
                            "recall": 0.057692307692307696,
                            "f1": 0.10909090909090909,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9423076923076923,
                            "true_positive_rate": 0.057692307692307696
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.11538461538461539,
                                "stdev": 0.03597747487282636
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.11538461538461539,
                                "stdev": 0.03597747487282636
                            },
                            "f1": {
                                "average": 0.20498556576873353,
                                "stdev": 0.05926097213159439
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8846153846153846,
                                "stdev": 0.03597747487282635
                            },
                            "true_positive_rate": {
                                "average": 0.11538461538461539,
                                "stdev": 0.03597747487282636
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.40865384615384615,
                            "stdev": 0.22311196071596867
                        },
                        "precision": {
                            "average": 0.9772727272727273,
                            "stdev": 0.07186994682200862
                        },
                        "recall": {
                            "average": 0.40865384615384615,
                            "stdev": 0.22311196071596867
                        },
                        "f1": {
                            "average": 0.5159716493737967,
                            "stdev": 0.22456235668642563
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.591346153846154,
                            "stdev": 0.22311196071596867
                        },
                        "true_positive_rate": {
                            "average": 0.40865384615384615,
                            "stdev": 0.22311196071596867
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 59,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.8082191780821918,
                            "precision": 1.0,
                            "recall": 0.8082191780821918,
                            "f1": 0.8939393939393939,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1917808219178082,
                            "true_positive_rate": 0.8082191780821918
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 37,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.5068493150684932,
                            "precision": 1.0,
                            "recall": 0.5068493150684932,
                            "f1": 0.6727272727272727,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4931506849315068,
                            "true_positive_rate": 0.5068493150684932
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 53,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.726027397260274,
                            "precision": 1.0,
                            "recall": 0.726027397260274,
                            "f1": 0.8412698412698413,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.273972602739726,
                            "true_positive_rate": 0.726027397260274
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 17,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.2328767123287671,
                            "precision": 1.0,
                            "recall": 0.2328767123287671,
                            "f1": 0.37777777777777777,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7671232876712328,
                            "true_positive_rate": 0.2328767123287671
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5684931506849314,
                                "stdev": 0.2228923300722326
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5684931506849314,
                                "stdev": 0.2228923300722326
                            },
                            "f1": {
                                "average": 0.6964285714285714,
                                "stdev": 0.20130176248144355
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.43150684931506844,
                                "stdev": 0.22289233007223258
                            },
                            "true_positive_rate": {
                                "average": 0.5684931506849314,
                                "stdev": 0.2228923300722326
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 71,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.9726027397260274,
                            "precision": 1.0,
                            "recall": 0.9726027397260274,
                            "f1": 0.9861111111111112,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0273972602739726,
                            "true_positive_rate": 0.9726027397260274
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 41,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.5616438356164384,
                            "precision": 1.0,
                            "recall": 0.5616438356164384,
                            "f1": 0.7192982456140351,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4383561643835616,
                            "true_positive_rate": 0.5616438356164384
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 65,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.8904109589041096,
                            "precision": 1.0,
                            "recall": 0.8904109589041096,
                            "f1": 0.9420289855072463,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1095890410958904,
                            "true_positive_rate": 0.8904109589041096
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 41,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.5616438356164384,
                            "precision": 1.0,
                            "recall": 0.5616438356164384,
                            "f1": 0.7192982456140351,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4383561643835616,
                            "true_positive_rate": 0.5616438356164384
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7465753424657534,
                                "stdev": 0.1872006899824171
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7465753424657534,
                                "stdev": 0.1872006899824171
                            },
                            "f1": {
                                "average": 0.841684146961607,
                                "stdev": 0.12337428043711926
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2534246575342466,
                                "stdev": 0.1872006899824171
                            },
                            "true_positive_rate": {
                                "average": 0.7465753424657534,
                                "stdev": 0.1872006899824171
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 72,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.9863013698630136,
                            "precision": 1.0,
                            "recall": 0.9863013698630136,
                            "f1": 0.993103448275862,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0136986301369863,
                            "true_positive_rate": 0.9863013698630136
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 56,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.7671232876712328,
                            "precision": 1.0,
                            "recall": 0.7671232876712328,
                            "f1": 0.8682170542635659,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2328767123287671,
                            "true_positive_rate": 0.7671232876712328
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 65,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.8904109589041096,
                            "precision": 1.0,
                            "recall": 0.8904109589041096,
                            "f1": 0.9420289855072463,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1095890410958904,
                            "true_positive_rate": 0.8904109589041096
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 38,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.5205479452054794,
                            "precision": 1.0,
                            "recall": 0.5205479452054794,
                            "f1": 0.6846846846846847,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4794520547945205,
                            "true_positive_rate": 0.5205479452054794
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7910958904109588,
                                "stdev": 0.17445596749246614
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7910958904109588,
                                "stdev": 0.17445596749246614
                            },
                            "f1": {
                                "average": 0.8720085431828397,
                                "stdev": 0.11690962506745908
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2089041095890411,
                                "stdev": 0.17445596749246614
                            },
                            "true_positive_rate": {
                                "average": 0.7910958904109588,
                                "stdev": 0.17445596749246614
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 30,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.410958904109589,
                            "precision": 1.0,
                            "recall": 0.410958904109589,
                            "f1": 0.5825242718446602,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.589041095890411,
                            "true_positive_rate": 0.410958904109589
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 26,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.3561643835616438,
                            "precision": 1.0,
                            "recall": 0.3561643835616438,
                            "f1": 0.5252525252525253,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6438356164383562,
                            "true_positive_rate": 0.3561643835616438
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 8,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.1095890410958904,
                            "precision": 1.0,
                            "recall": 0.1095890410958904,
                            "f1": 0.19753086419753085,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8904109589041096,
                            "true_positive_rate": 0.1095890410958904
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 3,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.0410958904109589,
                            "precision": 1.0,
                            "recall": 0.0410958904109589,
                            "f1": 0.07894736842105263,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.958904109589041,
                            "true_positive_rate": 0.0410958904109589
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.22945205479452052,
                                "stdev": 0.15719886873326713
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.22945205479452052,
                                "stdev": 0.15719886873326713
                            },
                            "f1": {
                                "average": 0.34606375742894224,
                                "stdev": 0.2129761577267441
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7705479452054794,
                                "stdev": 0.1571988687332671
                            },
                            "true_positive_rate": {
                                "average": 0.22945205479452052,
                                "stdev": 0.15719886873326713
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 66,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.9041095890410958,
                            "precision": 1.0,
                            "recall": 0.9041095890410958,
                            "f1": 0.9496402877697842,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0958904109589041,
                            "true_positive_rate": 0.9041095890410958
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 65,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.8904109589041096,
                            "precision": 1.0,
                            "recall": 0.8904109589041096,
                            "f1": 0.9420289855072463,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1095890410958904,
                            "true_positive_rate": 0.8904109589041096
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 30,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.410958904109589,
                            "precision": 1.0,
                            "recall": 0.410958904109589,
                            "f1": 0.5825242718446602,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.589041095890411,
                            "true_positive_rate": 0.410958904109589
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 10,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.136986301369863,
                            "precision": 1.0,
                            "recall": 0.136986301369863,
                            "f1": 0.24096385542168675,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.863013698630137,
                            "true_positive_rate": 0.136986301369863
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5856164383561644,
                                "stdev": 0.32638621327757833
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5856164383561644,
                                "stdev": 0.32638621327757833
                            },
                            "f1": {
                                "average": 0.6787893501358443,
                                "stdev": 0.29309276055873407
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4143835616438356,
                                "stdev": 0.3263862132775784
                            },
                            "true_positive_rate": {
                                "average": 0.5856164383561644,
                                "stdev": 0.32638621327757833
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 23,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.3150684931506849,
                            "precision": 1.0,
                            "recall": 0.3150684931506849,
                            "f1": 0.4791666666666667,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.684931506849315,
                            "true_positive_rate": 0.3150684931506849
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 7,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.0958904109589041,
                            "precision": 1.0,
                            "recall": 0.0958904109589041,
                            "f1": 0.175,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9041095890410958,
                            "true_positive_rate": 0.0958904109589041
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 5,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.0684931506849315,
                            "precision": 1.0,
                            "recall": 0.0684931506849315,
                            "f1": 0.1282051282051282,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9315068493150684,
                            "true_positive_rate": 0.0684931506849315
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 0,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.11986301369863013,
                                "stdev": 0.11798926833528915
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.11986301369863013,
                                "stdev": 0.11798926833528915
                            },
                            "f1": {
                                "average": 0.19559294871794872,
                                "stdev": 0.1758094444205212
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8801369863013698,
                                "stdev": 0.11798926833528915
                            },
                            "true_positive_rate": {
                                "average": 0.11986301369863013,
                                "stdev": 0.11798926833528915
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 46,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.6301369863013698,
                            "precision": 1.0,
                            "recall": 0.6301369863013698,
                            "f1": 0.773109243697479,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3698630136986301,
                            "true_positive_rate": 0.6301369863013698
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 16,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.2191780821917808,
                            "precision": 1.0,
                            "recall": 0.2191780821917808,
                            "f1": 0.3595505617977528,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7808219178082192,
                            "true_positive_rate": 0.2191780821917808
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 15,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.2054794520547945,
                            "precision": 1.0,
                            "recall": 0.2054794520547945,
                            "f1": 0.3409090909090909,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7945205479452054,
                            "true_positive_rate": 0.2054794520547945
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 1,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.0136986301369863,
                            "precision": 1.0,
                            "recall": 0.0136986301369863,
                            "f1": 0.02702702702702703,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9863013698630136,
                            "true_positive_rate": 0.0136986301369863
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2671232876712329,
                                "stdev": 0.2247786175537252
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2671232876712329,
                                "stdev": 0.2247786175537252
                            },
                            "f1": {
                                "average": 0.3751489808578374,
                                "stdev": 0.26503629292950326
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.732876712328767,
                                "stdev": 0.2247786175537252
                            },
                            "true_positive_rate": {
                                "average": 0.2671232876712329,
                                "stdev": 0.2247786175537252
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 45,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.6164383561643836,
                            "precision": 1.0,
                            "recall": 0.6164383561643836,
                            "f1": 0.7627118644067796,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3835616438356164,
                            "true_positive_rate": 0.6164383561643836
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 12,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.1643835616438356,
                            "precision": 1.0,
                            "recall": 0.1643835616438356,
                            "f1": 0.2823529411764706,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8356164383561644,
                            "true_positive_rate": 0.1643835616438356
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 27,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.3698630136986301,
                            "precision": 1.0,
                            "recall": 0.3698630136986301,
                            "f1": 0.54,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6301369863013698,
                            "true_positive_rate": 0.3698630136986301
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 6,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.0821917808219178,
                            "precision": 1.0,
                            "recall": 0.0821917808219178,
                            "f1": 0.1518987341772152,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9178082191780822,
                            "true_positive_rate": 0.0821917808219178
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3082191780821918,
                                "stdev": 0.2065042935846758
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3082191780821918,
                                "stdev": 0.2065042935846758
                            },
                            "f1": {
                                "average": 0.43424088494011637,
                                "stdev": 0.23551285120360632
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6917808219178082,
                                "stdev": 0.20650429358467584
                            },
                            "true_positive_rate": {
                                "average": 0.3082191780821918,
                                "stdev": 0.2065042935846758
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 33,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.4520547945205479,
                            "precision": 1.0,
                            "recall": 0.4520547945205479,
                            "f1": 0.6226415094339622,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.547945205479452,
                            "true_positive_rate": 0.4520547945205479
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 15,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.2054794520547945,
                            "precision": 1.0,
                            "recall": 0.2054794520547945,
                            "f1": 0.3409090909090909,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7945205479452054,
                            "true_positive_rate": 0.2054794520547945
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 36,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.4931506849315068,
                            "precision": 1.0,
                            "recall": 0.4931506849315068,
                            "f1": 0.6605504587155964,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5068493150684932,
                            "true_positive_rate": 0.4931506849315068
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 8,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.1095890410958904,
                            "precision": 1.0,
                            "recall": 0.1095890410958904,
                            "f1": 0.19753086419753085,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8904109589041096,
                            "true_positive_rate": 0.1095890410958904
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3150684931506849,
                                "stdev": 0.16179468234270844
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3150684931506849,
                                "stdev": 0.16179468234270844
                            },
                            "f1": {
                                "average": 0.4554079808140451,
                                "stdev": 0.19343027968635831
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6849315068493151,
                                "stdev": 0.16179468234270844
                            },
                            "true_positive_rate": {
                                "average": 0.3150684931506849,
                                "stdev": 0.16179468234270844
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 30,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.410958904109589,
                            "precision": 1.0,
                            "recall": 0.410958904109589,
                            "f1": 0.5825242718446602,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.589041095890411,
                            "true_positive_rate": 0.410958904109589
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 35,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.4794520547945205,
                            "precision": 1.0,
                            "recall": 0.4794520547945205,
                            "f1": 0.6481481481481481,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5205479452054794,
                            "true_positive_rate": 0.4794520547945205
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 26,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.3561643835616438,
                            "precision": 1.0,
                            "recall": 0.3561643835616438,
                            "f1": 0.5252525252525253,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6438356164383562,
                            "true_positive_rate": 0.3561643835616438
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 11,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.1506849315068493,
                            "precision": 1.0,
                            "recall": 0.1506849315068493,
                            "f1": 0.2619047619047619,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8493150684931506,
                            "true_positive_rate": 0.1506849315068493
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3493150684931507,
                                "stdev": 0.12271556758334873
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3493150684931507,
                                "stdev": 0.12271556758334873
                            },
                            "f1": {
                                "average": 0.5044574267875239,
                                "stdev": 0.1466336310880028
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6506849315068493,
                                "stdev": 0.12271556758334873
                            },
                            "true_positive_rate": {
                                "average": 0.3493150684931507,
                                "stdev": 0.12271556758334873
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 73,
                        "prediction_error_num": 52,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.7123287671232876,
                            "precision": 1.0,
                            "recall": 0.7123287671232876,
                            "f1": 0.832,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2876712328767123,
                            "true_positive_rate": 0.7123287671232876
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 73,
                        "prediction_error_num": 55,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.7534246575342466,
                            "precision": 1.0,
                            "recall": 0.7534246575342466,
                            "f1": 0.859375,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2465753424657534,
                            "true_positive_rate": 0.7534246575342466
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 73,
                        "prediction_error_num": 43,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.589041095890411,
                            "precision": 1.0,
                            "recall": 0.589041095890411,
                            "f1": 0.7413793103448276,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.410958904109589,
                            "true_positive_rate": 0.589041095890411
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 73,
                        "prediction_error_num": 38,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.5205479452054794,
                            "precision": 1.0,
                            "recall": 0.5205479452054794,
                            "f1": 0.6846846846846847,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4794520547945205,
                            "true_positive_rate": 0.5205479452054794
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6438356164383562,
                                "stdev": 0.09341220340401271
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6438356164383562,
                                "stdev": 0.09341220340401271
                            },
                            "f1": {
                                "average": 0.779359748757378,
                                "stdev": 0.06996305800471489
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3561643835616438,
                                "stdev": 0.09341220340401271
                            },
                            "true_positive_rate": {
                                "average": 0.6438356164383562,
                                "stdev": 0.09341220340401271
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.4476961394769614,
                            "stdev": 0.2161167891813785
                        },
                        "precision": {
                            "average": 0.9772727272727273,
                            "stdev": 0.07186994682200862
                        },
                        "recall": {
                            "average": 0.4476961394769614,
                            "stdev": 0.2161167891813785
                        },
                        "f1": {
                            "average": 0.5617438490920595,
                            "stdev": 0.2132167356277922
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5523038605230386,
                            "stdev": 0.21611678918137847
                        },
                        "true_positive_rate": {
                            "average": 0.4476961394769614,
                            "stdev": 0.2161167891813785
                        }
                    }
                }
            }
        }
    },
    "answerability_classification": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 21,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.75,
                            "precision": 1.0,
                            "recall": 0.75,
                            "f1": 0.8571428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25,
                            "true_positive_rate": 0.75
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 20,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.7142857142857143,
                            "precision": 1.0,
                            "recall": 0.7142857142857143,
                            "f1": 0.8333333333333334,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2857142857142857,
                            "true_positive_rate": 0.7142857142857143
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 18,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6428571428571429,
                            "precision": 1.0,
                            "recall": 0.6428571428571429,
                            "f1": 0.782608695652174,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.35714285714285715,
                            "true_positive_rate": 0.6428571428571429
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 11,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.39285714285714285,
                            "precision": 1.0,
                            "recall": 0.39285714285714285,
                            "f1": 0.5641025641025641,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6071428571428571,
                            "true_positive_rate": 0.39285714285714285
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.625,
                                "stdev": 0.13946874421261884
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.625,
                                "stdev": 0.13946874421261884
                            },
                            "f1": {
                                "average": 0.7592968625577321,
                                "stdev": 0.11586575569693476
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.375,
                                "stdev": 0.13946874421261882
                            },
                            "true_positive_rate": {
                                "average": 0.625,
                                "stdev": 0.13946874421261884
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 27,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.9642857142857143,
                            "precision": 1.0,
                            "recall": 0.9642857142857143,
                            "f1": 0.9818181818181818,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.03571428571428571,
                            "true_positive_rate": 0.9642857142857143
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 19,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6785714285714286,
                            "precision": 1.0,
                            "recall": 0.6785714285714286,
                            "f1": 0.8085106382978723,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.32142857142857145,
                            "true_positive_rate": 0.6785714285714286
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 27,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.9642857142857143,
                            "precision": 1.0,
                            "recall": 0.9642857142857143,
                            "f1": 0.9818181818181818,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.03571428571428571,
                            "true_positive_rate": 0.9642857142857143
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 15,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5357142857142857,
                            "precision": 1.0,
                            "recall": 0.5357142857142857,
                            "f1": 0.6976744186046512,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4642857142857143,
                            "true_positive_rate": 0.5357142857142857
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7857142857142857,
                                "stdev": 0.18557687223952257
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7857142857142857,
                                "stdev": 0.18557687223952257
                            },
                            "f1": {
                                "average": 0.8674553551347217,
                                "stdev": 0.12089019636230482
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2142857142857143,
                                "stdev": 0.1855768722395226
                            },
                            "true_positive_rate": {
                                "average": 0.7857142857142857,
                                "stdev": 0.18557687223952257
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 24,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8571428571428571,
                            "precision": 1.0,
                            "recall": 0.8571428571428571,
                            "f1": 0.9230769230769231,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14285714285714285,
                            "true_positive_rate": 0.8571428571428571
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 25,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8928571428571429,
                            "precision": 1.0,
                            "recall": 0.8928571428571429,
                            "f1": 0.9433962264150944,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.10714285714285714,
                            "true_positive_rate": 0.8928571428571429
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 15,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5357142857142857,
                            "precision": 1.0,
                            "recall": 0.5357142857142857,
                            "f1": 0.6976744186046512,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4642857142857143,
                            "true_positive_rate": 0.5357142857142857
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8214285714285714,
                                "stdev": 0.1731314234791546
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8214285714285714,
                                "stdev": 0.1731314234791546
                            },
                            "f1": {
                                "average": 0.8910368920241671,
                                "stdev": 0.11514129202244142
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.17857142857142858,
                                "stdev": 0.1731314234791546
                            },
                            "true_positive_rate": {
                                "average": 0.8214285714285714,
                                "stdev": 0.1731314234791546
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 15,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5357142857142857,
                            "precision": 1.0,
                            "recall": 0.5357142857142857,
                            "f1": 0.6976744186046512,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4642857142857143,
                            "true_positive_rate": 0.5357142857142857
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 13,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.4642857142857143,
                            "precision": 1.0,
                            "recall": 0.4642857142857143,
                            "f1": 0.6341463414634146,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5357142857142857,
                            "true_positive_rate": 0.4642857142857143
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 13,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.4642857142857143,
                            "precision": 1.0,
                            "recall": 0.4642857142857143,
                            "f1": 0.6341463414634146,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5357142857142857,
                            "true_positive_rate": 0.4642857142857143
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 5,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.17857142857142858,
                            "precision": 1.0,
                            "recall": 0.17857142857142858,
                            "f1": 0.30303030303030304,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8214285714285714,
                            "true_positive_rate": 0.17857142857142858
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.41071428571428575,
                                "stdev": 0.13716331692622513
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.41071428571428575,
                                "stdev": 0.13716331692622513
                            },
                            "f1": {
                                "average": 0.567249351140446,
                                "stdev": 0.1547359187743874
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5892857142857142,
                                "stdev": 0.13716331692622513
                            },
                            "true_positive_rate": {
                                "average": 0.41071428571428575,
                                "stdev": 0.13716331692622513
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 26,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.9285714285714286,
                            "precision": 1.0,
                            "recall": 0.9285714285714286,
                            "f1": 0.9629629629629629,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.07142857142857142,
                            "true_positive_rate": 0.9285714285714286
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 23,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8214285714285714,
                            "precision": 1.0,
                            "recall": 0.8214285714285714,
                            "f1": 0.9019607843137255,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17857142857142858,
                            "true_positive_rate": 0.8214285714285714
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 14,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5,
                            "precision": 1.0,
                            "recall": 0.5,
                            "f1": 0.6666666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5,
                            "true_positive_rate": 0.5
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 8,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.2857142857142857,
                            "precision": 1.0,
                            "recall": 0.2857142857142857,
                            "f1": 0.4444444444444444,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7142857142857143,
                            "true_positive_rate": 0.2857142857142857
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6339285714285714,
                                "stdev": 0.2555194289509676
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6339285714285714,
                                "stdev": 0.2555194289509676
                            },
                            "f1": {
                                "average": 0.7440087145969498,
                                "stdev": 0.20531096143130814
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3660714285714286,
                                "stdev": 0.2555194289509676
                            },
                            "true_positive_rate": {
                                "average": 0.6339285714285714,
                                "stdev": 0.2555194289509676
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 9,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.32142857142857145,
                            "precision": 1.0,
                            "recall": 0.32142857142857145,
                            "f1": 0.4864864864864865,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6785714285714286,
                            "true_positive_rate": 0.32142857142857145
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 3,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.10714285714285714,
                            "precision": 1.0,
                            "recall": 0.10714285714285714,
                            "f1": 0.1935483870967742,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8928571428571429,
                            "true_positive_rate": 0.10714285714285714
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 8,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.2857142857142857,
                            "precision": 1.0,
                            "recall": 0.2857142857142857,
                            "f1": 0.4444444444444444,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7142857142857143,
                            "true_positive_rate": 0.2857142857142857
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 0,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.17857142857142858,
                                "stdev": 0.13122266479195596
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.17857142857142858,
                                "stdev": 0.13122266479195596
                            },
                            "f1": {
                                "average": 0.28111982950692627,
                                "stdev": 0.19719755938604236
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8214285714285715,
                                "stdev": 0.13122266479195596
                            },
                            "true_positive_rate": {
                                "average": 0.17857142857142858,
                                "stdev": 0.13122266479195596
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 12,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.42857142857142855,
                            "precision": 1.0,
                            "recall": 0.42857142857142855,
                            "f1": 0.6,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5714285714285714,
                            "true_positive_rate": 0.42857142857142855
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 13,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.4642857142857143,
                            "precision": 1.0,
                            "recall": 0.4642857142857143,
                            "f1": 0.6341463414634146,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5357142857142857,
                            "true_positive_rate": 0.4642857142857143
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 7,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 1,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.03571428571428571,
                            "precision": 1.0,
                            "recall": 0.03571428571428571,
                            "f1": 0.06896551724137931,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9642857142857143,
                            "true_positive_rate": 0.03571428571428571
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.29464285714285715,
                                "stdev": 0.170112132886229
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.29464285714285715,
                                "stdev": 0.170112132886229
                            },
                            "f1": {
                                "average": 0.4257779646761985,
                                "stdev": 0.2245831272294182
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7053571428571429,
                                "stdev": 0.17011213288622903
                            },
                            "true_positive_rate": {
                                "average": 0.29464285714285715,
                                "stdev": 0.170112132886229
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 17,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6071428571428571,
                            "precision": 1.0,
                            "recall": 0.6071428571428571,
                            "f1": 0.7555555555555555,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.39285714285714285,
                            "true_positive_rate": 0.6071428571428571
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 5,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.17857142857142858,
                            "precision": 1.0,
                            "recall": 0.17857142857142858,
                            "f1": 0.30303030303030304,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8214285714285714,
                            "true_positive_rate": 0.17857142857142858
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 15,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5357142857142857,
                            "precision": 1.0,
                            "recall": 0.5357142857142857,
                            "f1": 0.6976744186046512,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4642857142857143,
                            "true_positive_rate": 0.5357142857142857
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 5,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.17857142857142858,
                            "precision": 1.0,
                            "recall": 0.17857142857142858,
                            "f1": 0.30303030303030304,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8214285714285714,
                            "true_positive_rate": 0.17857142857142858
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.375,
                                "stdev": 0.198045294757311
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.375,
                                "stdev": 0.198045294757311
                            },
                            "f1": {
                                "average": 0.5148226450552033,
                                "stdev": 0.2127786981636871
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.625,
                                "stdev": 0.198045294757311
                            },
                            "true_positive_rate": {
                                "average": 0.375,
                                "stdev": 0.198045294757311
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 11,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.39285714285714285,
                            "precision": 1.0,
                            "recall": 0.39285714285714285,
                            "f1": 0.5641025641025641,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6071428571428571,
                            "true_positive_rate": 0.39285714285714285
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 7,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 11,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.39285714285714285,
                            "precision": 1.0,
                            "recall": 0.39285714285714285,
                            "f1": 0.5641025641025641,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6071428571428571,
                            "true_positive_rate": 0.39285714285714285
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 5,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.17857142857142858,
                            "precision": 1.0,
                            "recall": 0.17857142857142858,
                            "f1": 0.30303030303030304,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8214285714285714,
                            "true_positive_rate": 0.17857142857142858
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.30357142857142855,
                                "stdev": 0.09278843611976127
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.30357142857142855,
                                "stdev": 0.09278843611976127
                            },
                            "f1": {
                                "average": 0.45780885780885777,
                                "stdev": 0.1116859089767015
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6964285714285714,
                                "stdev": 0.0927884361197613
                            },
                            "true_positive_rate": {
                                "average": 0.30357142857142855,
                                "stdev": 0.09278843611976127
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 4,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.14285714285714285,
                            "precision": 1.0,
                            "recall": 0.14285714285714285,
                            "f1": 0.25,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8571428571428571,
                            "true_positive_rate": 0.14285714285714285
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 4,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.14285714285714285,
                            "precision": 1.0,
                            "recall": 0.14285714285714285,
                            "f1": 0.25,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8571428571428571,
                            "true_positive_rate": 0.14285714285714285
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 2,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.07142857142857142,
                            "precision": 1.0,
                            "recall": 0.07142857142857142,
                            "f1": 0.13333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9285714285714286,
                            "true_positive_rate": 0.07142857142857142
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 1,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.03571428571428571,
                            "precision": 1.0,
                            "recall": 0.03571428571428571,
                            "f1": 0.06896551724137931,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9642857142857143,
                            "true_positive_rate": 0.03571428571428571
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.0982142857142857,
                                "stdev": 0.046394218059880636
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.0982142857142857,
                                "stdev": 0.046394218059880636
                            },
                            "f1": {
                                "average": 0.17557471264367816,
                                "stdev": 0.0778268935948688
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9017857142857143,
                                "stdev": 0.04639421805988068
                            },
                            "true_positive_rate": {
                                "average": 0.0982142857142857,
                                "stdev": 0.046394218059880636
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 28,
                        "prediction_error_num": 3,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.10714285714285714,
                            "precision": 1.0,
                            "recall": 0.10714285714285714,
                            "f1": 0.1935483870967742,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8928571428571429,
                            "true_positive_rate": 0.10714285714285714
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 28,
                        "prediction_error_num": 1,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.03571428571428571,
                            "precision": 1.0,
                            "recall": 0.03571428571428571,
                            "f1": 0.06896551724137931,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9642857142857143,
                            "true_positive_rate": 0.03571428571428571
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 28,
                        "prediction_error_num": 4,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.14285714285714285,
                            "precision": 1.0,
                            "recall": 0.14285714285714285,
                            "f1": 0.25,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8571428571428571,
                            "true_positive_rate": 0.14285714285714285
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 28,
                        "prediction_error_num": 4,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.14285714285714285,
                            "precision": 1.0,
                            "recall": 0.14285714285714285,
                            "f1": 0.25,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8571428571428571,
                            "true_positive_rate": 0.14285714285714285
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.10714285714285714,
                                "stdev": 0.04374088826398532
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.10714285714285714,
                                "stdev": 0.04374088826398532
                            },
                            "f1": {
                                "average": 0.19062847608453837,
                                "stdev": 0.07392624219260718
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8928571428571429,
                                "stdev": 0.043740888263985346
                            },
                            "true_positive_rate": {
                                "average": 0.10714285714285714,
                                "stdev": 0.04374088826398532
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.42126623376623373,
                            "stdev": 0.24748907715802657
                        },
                        "precision": {
                            "average": 0.9772727272727273,
                            "stdev": 0.07186994682200862
                        },
                        "recall": {
                            "average": 0.42126623376623373,
                            "stdev": 0.24748907715802657
                        },
                        "f1": {
                            "average": 0.5340708782935836,
                            "stdev": 0.24547570837259755
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5787337662337663,
                            "stdev": 0.24748907715802654
                        },
                        "true_positive_rate": {
                            "average": 0.42126623376623373,
                            "stdev": 0.24748907715802657
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 14,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.23333333333333334,
                            "precision": 1.0,
                            "recall": 0.23333333333333334,
                            "f1": 0.3783783783783784,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7666666666666667,
                            "true_positive_rate": 0.23333333333333334
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 16,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.26666666666666666,
                            "precision": 1.0,
                            "recall": 0.26666666666666666,
                            "f1": 0.42105263157894735,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7333333333333333,
                            "true_positive_rate": 0.26666666666666666
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 19,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.31666666666666665,
                            "precision": 1.0,
                            "recall": 0.31666666666666665,
                            "f1": 0.4810126582278481,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6833333333333333,
                            "true_positive_rate": 0.31666666666666665
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 6,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.1,
                            "precision": 1.0,
                            "recall": 0.1,
                            "f1": 0.18181818181818182,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9,
                            "true_positive_rate": 0.1
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.22916666666666666,
                                "stdev": 0.08025566785107592
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.22916666666666666,
                                "stdev": 0.08025566785107592
                            },
                            "f1": {
                                "average": 0.36556546250083893,
                                "stdev": 0.11217632872523624
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7708333333333334,
                                "stdev": 0.08025566785107593
                            },
                            "true_positive_rate": {
                                "average": 0.22916666666666666,
                                "stdev": 0.08025566785107592
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 59,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.9833333333333333,
                            "precision": 1.0,
                            "recall": 0.9833333333333333,
                            "f1": 0.9915966386554622,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.016666666666666666,
                            "true_positive_rate": 0.9833333333333333
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 24,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.4,
                            "precision": 1.0,
                            "recall": 0.4,
                            "f1": 0.5714285714285714,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6,
                            "true_positive_rate": 0.4
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 57,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.95,
                            "precision": 1.0,
                            "recall": 0.95,
                            "f1": 0.9743589743589743,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.05,
                            "true_positive_rate": 0.95
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 57,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.95,
                            "precision": 1.0,
                            "recall": 0.95,
                            "f1": 0.9743589743589743,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.05,
                            "true_positive_rate": 0.95
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8208333333333333,
                                "stdev": 0.24334902917414727
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8208333333333333,
                                "stdev": 0.24334902917414727
                            },
                            "f1": {
                                "average": 0.8779357897004956,
                                "stdev": 0.17710189476493673
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.1791666666666667,
                                "stdev": 0.2433490291741473
                            },
                            "true_positive_rate": {
                                "average": 0.8208333333333333,
                                "stdev": 0.24334902917414727
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 50,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8333333333333334,
                            "precision": 1.0,
                            "recall": 0.8333333333333334,
                            "f1": 0.9090909090909091,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.16666666666666666,
                            "true_positive_rate": 0.8333333333333334
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 10,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.16666666666666666,
                            "precision": 1.0,
                            "recall": 0.16666666666666666,
                            "f1": 0.2857142857142857,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8333333333333334,
                            "true_positive_rate": 0.16666666666666666
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 48,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8,
                            "precision": 1.0,
                            "recall": 0.8,
                            "f1": 0.8888888888888888,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2,
                            "true_positive_rate": 0.8
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 0,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.45,
                                "stdev": 0.3715582801601326
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.45,
                                "stdev": 0.3715582801601326
                            },
                            "f1": {
                                "average": 0.5209235209235209,
                                "stdev": 0.39139402532411366
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.55,
                                "stdev": 0.37155828016013254
                            },
                            "true_positive_rate": {
                                "average": 0.45,
                                "stdev": 0.3715582801601326
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 16,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.26666666666666666,
                            "precision": 1.0,
                            "recall": 0.26666666666666666,
                            "f1": 0.42105263157894735,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7333333333333333,
                            "true_positive_rate": 0.26666666666666666
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 13,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.21666666666666667,
                            "precision": 1.0,
                            "recall": 0.21666666666666667,
                            "f1": 0.3561643835616438,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7833333333333333,
                            "true_positive_rate": 0.21666666666666667
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 19,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.31666666666666665,
                            "precision": 1.0,
                            "recall": 0.31666666666666665,
                            "f1": 0.4810126582278481,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6833333333333333,
                            "true_positive_rate": 0.31666666666666665
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 1,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.016666666666666666,
                            "precision": 1.0,
                            "recall": 0.016666666666666666,
                            "f1": 0.03278688524590164,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9833333333333333,
                            "true_positive_rate": 0.016666666666666666
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2041666666666667,
                                "stdev": 0.11388041973930374
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2041666666666667,
                                "stdev": 0.11388041973930374
                            },
                            "f1": {
                                "average": 0.32275413965358524,
                                "stdev": 0.17313694380666353
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7958333333333334,
                                "stdev": 0.11388041973930371
                            },
                            "true_positive_rate": {
                                "average": 0.2041666666666667,
                                "stdev": 0.11388041973930374
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 59,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.9833333333333333,
                            "precision": 1.0,
                            "recall": 0.9833333333333333,
                            "f1": 0.9915966386554622,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.016666666666666666,
                            "true_positive_rate": 0.9833333333333333
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 51,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.85,
                            "precision": 1.0,
                            "recall": 0.85,
                            "f1": 0.918918918918919,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15,
                            "true_positive_rate": 0.85
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 1,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.016666666666666666,
                            "precision": 1.0,
                            "recall": 0.016666666666666666,
                            "f1": 0.03278688524590164,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9833333333333333,
                            "true_positive_rate": 0.016666666666666666
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 2,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.03333333333333333,
                            "precision": 1.0,
                            "recall": 0.03333333333333333,
                            "f1": 0.06451612903225806,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9666666666666667,
                            "true_positive_rate": 0.03333333333333333
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4708333333333333,
                                "stdev": 0.4483573413646258
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4708333333333333,
                                "stdev": 0.4483573413646258
                            },
                            "f1": {
                                "average": 0.5019546429631352,
                                "stdev": 0.4541693873766611
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5291666666666667,
                                "stdev": 0.44835734136462574
                            },
                            "true_positive_rate": {
                                "average": 0.4708333333333333,
                                "stdev": 0.4483573413646258
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 4,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.06666666666666667,
                            "precision": 1.0,
                            "recall": 0.06666666666666667,
                            "f1": 0.125,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9333333333333333,
                            "true_positive_rate": 0.06666666666666667
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 0,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 1,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.016666666666666666,
                            "precision": 1.0,
                            "recall": 0.016666666666666666,
                            "f1": 0.03278688524590164,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9833333333333333,
                            "true_positive_rate": 0.016666666666666666
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 0,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.020833333333333332,
                                "stdev": 0.027322660517925007
                            },
                            "precision": {
                                "average": 0.5,
                                "stdev": 0.5
                            },
                            "recall": {
                                "average": 0.020833333333333332,
                                "stdev": 0.027322660517925007
                            },
                            "f1": {
                                "average": 0.03944672131147541,
                                "stdev": 0.051175688943157105
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9791666666666666,
                                "stdev": 0.027322660517924993
                            },
                            "true_positive_rate": {
                                "average": 0.020833333333333332,
                                "stdev": 0.027322660517925007
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 2,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.03333333333333333,
                            "precision": 1.0,
                            "recall": 0.03333333333333333,
                            "f1": 0.06451612903225806,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9666666666666667,
                            "true_positive_rate": 0.03333333333333333
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 1,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.016666666666666666,
                            "precision": 1.0,
                            "recall": 0.016666666666666666,
                            "f1": 0.03278688524590164,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9833333333333333,
                            "true_positive_rate": 0.016666666666666666
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 0,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 0,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.0125,
                                "stdev": 0.013819269959814166
                            },
                            "precision": {
                                "average": 0.5,
                                "stdev": 0.5
                            },
                            "recall": {
                                "average": 0.0125,
                                "stdev": 0.013819269959814166
                            },
                            "f1": {
                                "average": 0.024325753569539928,
                                "stdev": 0.026787784541330385
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9875,
                                "stdev": 0.013819269959814168
                            },
                            "true_positive_rate": {
                                "average": 0.0125,
                                "stdev": 0.013819269959814166
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 9,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.15,
                            "precision": 1.0,
                            "recall": 0.15,
                            "f1": 0.2608695652173913,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.85,
                            "true_positive_rate": 0.15
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 2,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.03333333333333333,
                            "precision": 1.0,
                            "recall": 0.03333333333333333,
                            "f1": 0.06451612903225806,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9666666666666667,
                            "true_positive_rate": 0.03333333333333333
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 5,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.08333333333333333,
                            "precision": 1.0,
                            "recall": 0.08333333333333333,
                            "f1": 0.15384615384615385,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9166666666666666,
                            "true_positive_rate": 0.08333333333333333
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 2,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.03333333333333333,
                            "precision": 1.0,
                            "recall": 0.03333333333333333,
                            "f1": 0.06451612903225806,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9666666666666667,
                            "true_positive_rate": 0.03333333333333333
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.075,
                                "stdev": 0.04787135538781691
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.075,
                                "stdev": 0.04787135538781691
                            },
                            "f1": {
                                "average": 0.13593699428201533,
                                "stdev": 0.08082506613493194
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.925,
                                "stdev": 0.04787135538781692
                            },
                            "true_positive_rate": {
                                "average": 0.075,
                                "stdev": 0.04787135538781691
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 12,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.2,
                            "precision": 1.0,
                            "recall": 0.2,
                            "f1": 0.3333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8,
                            "true_positive_rate": 0.2
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 8,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.13333333333333333,
                            "precision": 1.0,
                            "recall": 0.13333333333333333,
                            "f1": 0.23529411764705882,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8666666666666667,
                            "true_positive_rate": 0.13333333333333333
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 15,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 5,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.08333333333333333,
                            "precision": 1.0,
                            "recall": 0.08333333333333333,
                            "f1": 0.15384615384615385,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9166666666666666,
                            "true_positive_rate": 0.08333333333333333
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.16666666666666669,
                                "stdev": 0.06346477588219925
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.16666666666666669,
                                "stdev": 0.06346477588219925
                            },
                            "f1": {
                                "average": 0.2806184012066365,
                                "stdev": 0.09375009810741303
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8333333333333334,
                                "stdev": 0.06346477588219922
                            },
                            "true_positive_rate": {
                                "average": 0.16666666666666669,
                                "stdev": 0.06346477588219925
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 38,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.6333333333333333,
                            "precision": 1.0,
                            "recall": 0.6333333333333333,
                            "f1": 0.7755102040816326,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.36666666666666664,
                            "true_positive_rate": 0.6333333333333333
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 40,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.6666666666666666,
                            "precision": 1.0,
                            "recall": 0.6666666666666666,
                            "f1": 0.8,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3333333333333333,
                            "true_positive_rate": 0.6666666666666666
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 37,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.6166666666666667,
                            "precision": 1.0,
                            "recall": 0.6166666666666667,
                            "f1": 0.7628865979381443,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.38333333333333336,
                            "true_positive_rate": 0.6166666666666667
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 38,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.6333333333333333,
                            "precision": 1.0,
                            "recall": 0.6333333333333333,
                            "f1": 0.7755102040816326,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.36666666666666664,
                            "true_positive_rate": 0.6333333333333333
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6375,
                                "stdev": 0.018162078931419454
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6375,
                                "stdev": 0.018162078931419454
                            },
                            "f1": {
                                "average": 0.7784767515253525,
                                "stdev": 0.013452731332761397
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3625,
                                "stdev": 0.018162078931419485
                            },
                            "true_positive_rate": {
                                "average": 0.6375,
                                "stdev": 0.018162078931419454
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 60,
                        "prediction_error_num": 48,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8,
                            "precision": 1.0,
                            "recall": 0.8,
                            "f1": 0.8888888888888888,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2,
                            "true_positive_rate": 0.8
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 60,
                        "prediction_error_num": 52,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8666666666666667,
                            "precision": 1.0,
                            "recall": 0.8666666666666667,
                            "f1": 0.9285714285714286,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.13333333333333333,
                            "true_positive_rate": 0.8666666666666667
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 60,
                        "prediction_error_num": 44,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.7333333333333333,
                            "precision": 1.0,
                            "recall": 0.7333333333333333,
                            "f1": 0.8461538461538461,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.26666666666666666,
                            "true_positive_rate": 0.7333333333333333
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 60,
                        "prediction_error_num": 46,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.7666666666666667,
                            "precision": 1.0,
                            "recall": 0.7666666666666667,
                            "f1": 0.8679245283018868,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.23333333333333334,
                            "true_positive_rate": 0.7666666666666667
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7916666666666666,
                                "stdev": 0.04930066485916349
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7916666666666666,
                                "stdev": 0.04930066485916349
                            },
                            "f1": {
                                "average": 0.8828846729790125,
                                "stdev": 0.030398560053374518
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.20833333333333337,
                                "stdev": 0.04930066485916347
                            },
                            "true_positive_rate": {
                                "average": 0.7916666666666666,
                                "stdev": 0.04930066485916349
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.3526515151515151,
                            "stdev": 0.28465197122086916
                        },
                        "precision": {
                            "average": 0.8863636363636364,
                            "stdev": 0.19550739243278697
                        },
                        "recall": {
                            "average": 0.3526515151515151,
                            "stdev": 0.28465197122086916
                        },
                        "f1": {
                            "average": 0.4300748046014189,
                            "stdev": 0.29881225423652563
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6473484848484848,
                            "stdev": 0.28465197122086916
                        },
                        "true_positive_rate": {
                            "average": 0.3526515151515151,
                            "stdev": 0.28465197122086916
                        }
                    }
                }
            }
        },
        "Parameterized Knowledge": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 42,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.6721311475409836,
                            "precision": 0.9761904761904762,
                            "recall": 0.6833333333333333,
                            "f1": 0.803921568627451,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.3114754098360656,
                            "true_positive_rate": 0.6721311475409836
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 39,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.6229508196721312,
                            "precision": 0.9743589743589743,
                            "recall": 0.6333333333333333,
                            "f1": 0.7676767676767676,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.36065573770491804,
                            "true_positive_rate": 0.6229508196721312
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 40,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.639344262295082,
                            "precision": 0.975,
                            "recall": 0.65,
                            "f1": 0.78,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.3442622950819672,
                            "true_positive_rate": 0.639344262295082
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 18,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.3114754098360656,
                            "precision": 1.0,
                            "recall": 0.3,
                            "f1": 0.46153846153846156,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6885245901639344,
                            "true_positive_rate": 0.29508196721311475
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5614754098360656,
                                "stdev": 0.14541963114106699
                            },
                            "precision": {
                                "average": 0.9813873626373626,
                                "stdev": 0.010766087283547342
                            },
                            "recall": {
                                "average": 0.5666666666666667,
                                "stdev": 0.15500896031448563
                            },
                            "f1": {
                                "average": 0.70328419946067,
                                "stdev": 0.14017899162340544
                            },
                            "true_negative_rate": {
                                "average": 0.004098360655737705,
                                "stdev": 0.007098568883479006
                            },
                            "false_positive_rate": {
                                "average": 0.012295081967213115,
                                "stdev": 0.007098568883479006
                            },
                            "false_negative_rate": {
                                "average": 0.42622950819672134,
                                "stdev": 0.15246782981752682
                            },
                            "true_positive_rate": {
                                "average": 0.5573770491803278,
                                "stdev": 0.15246782981752685
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.9836065573770492,
                            "precision": 0.9836065573770492,
                            "recall": 1.0,
                            "f1": 0.9917355371900827,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 0.9836065573770492
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 41,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.6557377049180327,
                            "precision": 0.975609756097561,
                            "recall": 0.6666666666666666,
                            "f1": 0.7920792079207921,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.32786885245901637,
                            "true_positive_rate": 0.6557377049180327
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.9672131147540983,
                            "precision": 0.9833333333333333,
                            "recall": 0.9833333333333333,
                            "f1": 0.9833333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.01639344262295082,
                            "true_positive_rate": 0.9672131147540983
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 35,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.5901639344262295,
                            "precision": 1.0,
                            "recall": 0.5833333333333334,
                            "f1": 0.7368421052631579,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4098360655737705,
                            "true_positive_rate": 0.5737704918032787
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7991803278688525,
                                "stdev": 0.1778424094569437
                            },
                            "precision": {
                                "average": 0.9856374117019858,
                                "stdev": 0.008892004357247215
                            },
                            "recall": {
                                "average": 0.8083333333333333,
                                "stdev": 0.18577914008006627
                            },
                            "f1": {
                                "average": 0.8759975459268415,
                                "stdev": 0.11327265557033526
                            },
                            "true_negative_rate": {
                                "average": 0.004098360655737705,
                                "stdev": 0.007098568883479006
                            },
                            "false_positive_rate": {
                                "average": 0.012295081967213115,
                                "stdev": 0.007098568883479006
                            },
                            "false_negative_rate": {
                                "average": 0.1885245901639344,
                                "stdev": 0.18273358040662258
                            },
                            "true_positive_rate": {
                                "average": 0.7950819672131147,
                                "stdev": 0.18273358040662258
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 56,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.9016393442622951,
                            "precision": 0.9821428571428571,
                            "recall": 0.9166666666666666,
                            "f1": 0.9482758620689655,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.08196721311475409,
                            "true_positive_rate": 0.9016393442622951
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 51,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.819672131147541,
                            "precision": 0.9803921568627451,
                            "recall": 0.8333333333333334,
                            "f1": 0.9009009009009009,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.16393442622950818,
                            "true_positive_rate": 0.819672131147541
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 55,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8852459016393442,
                            "precision": 0.9818181818181818,
                            "recall": 0.9,
                            "f1": 0.9391304347826087,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.09836065573770492,
                            "true_positive_rate": 0.8852459016393442
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 26,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.4426229508196721,
                            "precision": 1.0,
                            "recall": 0.43333333333333335,
                            "f1": 0.6046511627906976,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5573770491803278,
                            "true_positive_rate": 0.4262295081967213
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7622950819672131,
                                "stdev": 0.18709364279530044
                            },
                            "precision": {
                                "average": 0.986088298955946,
                                "stdev": 0.00805887511002259
                            },
                            "recall": {
                                "average": 0.7708333333333333,
                                "stdev": 0.19733467060357696
                            },
                            "f1": {
                                "average": 0.8482395901357931,
                                "stdev": 0.14175413303573842
                            },
                            "true_negative_rate": {
                                "average": 0.004098360655737705,
                                "stdev": 0.007098568883479006
                            },
                            "false_positive_rate": {
                                "average": 0.012295081967213115,
                                "stdev": 0.007098568883479006
                            },
                            "false_negative_rate": {
                                "average": 0.22540983606557374,
                                "stdev": 0.19409967600351832
                            },
                            "true_positive_rate": {
                                "average": 0.7581967213114754,
                                "stdev": 0.19409967600351832
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 22,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.3770491803278688,
                            "precision": 1.0,
                            "recall": 0.36666666666666664,
                            "f1": 0.5365853658536586,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6229508196721312,
                            "true_positive_rate": 0.36065573770491804
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 23,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.39344262295081966,
                            "precision": 1.0,
                            "recall": 0.38333333333333336,
                            "f1": 0.5542168674698795,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6065573770491803,
                            "true_positive_rate": 0.3770491803278688
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 12,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.21311475409836064,
                            "precision": 1.0,
                            "recall": 0.2,
                            "f1": 0.3333333333333333,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7868852459016393,
                            "true_positive_rate": 0.19672131147540983
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 9,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.16393442622950818,
                            "precision": 1.0,
                            "recall": 0.15,
                            "f1": 0.2608695652173913,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8360655737704918,
                            "true_positive_rate": 0.14754098360655737
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.28688524590163933,
                                "stdev": 0.10005373455519428
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.27499999999999997,
                                "stdev": 0.10172129679778086
                            },
                            "f1": {
                                "average": 0.4212512829685657,
                                "stdev": 0.1269189294329279
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7131147540983607,
                                "stdev": 0.10005373455519428
                            },
                            "true_positive_rate": {
                                "average": 0.2704918032786885,
                                "stdev": 0.10005373455519428
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 54,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8688524590163934,
                            "precision": 0.9814814814814815,
                            "recall": 0.8833333333333333,
                            "f1": 0.9298245614035088,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.11475409836065574,
                            "true_positive_rate": 0.8688524590163934
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 45,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.7213114754098361,
                            "precision": 0.9777777777777777,
                            "recall": 0.7333333333333333,
                            "f1": 0.8380952380952381,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.26229508196721313,
                            "true_positive_rate": 0.7213114754098361
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 22,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.3442622950819672,
                            "precision": 0.9545454545454546,
                            "recall": 0.35,
                            "f1": 0.5121951219512195,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.639344262295082,
                            "true_positive_rate": 0.3442622950819672
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 7,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.13114754098360656,
                            "precision": 1.0,
                            "recall": 0.11666666666666667,
                            "f1": 0.208955223880597,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8688524590163934,
                            "true_positive_rate": 0.11475409836065574
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5163934426229508,
                                "stdev": 0.29336934701923406
                            },
                            "precision": {
                                "average": 0.9784511784511785,
                                "stdev": 0.016166876248382478
                            },
                            "recall": {
                                "average": 0.5208333333333334,
                                "stdev": 0.3037668605288528
                            },
                            "f1": {
                                "average": 0.6222675363326409,
                                "stdev": 0.28465589953080994
                            },
                            "true_negative_rate": {
                                "average": 0.004098360655737705,
                                "stdev": 0.007098568883479006
                            },
                            "false_positive_rate": {
                                "average": 0.012295081967213115,
                                "stdev": 0.007098568883479006
                            },
                            "false_negative_rate": {
                                "average": 0.4713114754098361,
                                "stdev": 0.2987870759300192
                            },
                            "true_positive_rate": {
                                "average": 0.5122950819672131,
                                "stdev": 0.2987870759300192
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 7,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.13114754098360656,
                            "precision": 1.0,
                            "recall": 0.11666666666666667,
                            "f1": 0.208955223880597,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8688524590163934,
                            "true_positive_rate": 0.11475409836065574
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 2,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.04918032786885246,
                            "precision": 1.0,
                            "recall": 0.03333333333333333,
                            "f1": 0.06451612903225806,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9508196721311475,
                            "true_positive_rate": 0.03278688524590164
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 7,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.13114754098360656,
                            "precision": 1.0,
                            "recall": 0.11666666666666667,
                            "f1": 0.208955223880597,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8688524590163934,
                            "true_positive_rate": 0.11475409836065574
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 0,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.01639344262295082,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9836065573770492,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.0819672131147541,
                                "stdev": 0.050527983630893246
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.06666666666666667,
                                "stdev": 0.05137011669140814
                            },
                            "f1": {
                                "average": 0.12060664419836302,
                                "stdev": 0.09124561849771977
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9180327868852458,
                                "stdev": 0.050527983630893246
                            },
                            "true_positive_rate": {
                                "average": 0.06557377049180328,
                                "stdev": 0.050527983630893246
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 16,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.2786885245901639,
                            "precision": 1.0,
                            "recall": 0.26666666666666666,
                            "f1": 0.42105263157894735,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7213114754098361,
                            "true_positive_rate": 0.26229508196721313
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 14,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.2459016393442623,
                            "precision": 1.0,
                            "recall": 0.23333333333333334,
                            "f1": 0.3783783783783784,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7540983606557377,
                            "true_positive_rate": 0.22950819672131148
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 10,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.18032786885245902,
                            "precision": 1.0,
                            "recall": 0.16666666666666666,
                            "f1": 0.2857142857142857,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.819672131147541,
                            "true_positive_rate": 0.16393442622950818
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 0,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.01639344262295082,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9836065573770492,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.180327868852459,
                                "stdev": 0.10105596726178649
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.16666666666666666,
                                "stdev": 0.10274023338281628
                            },
                            "f1": {
                                "average": 0.27128632391790286,
                                "stdev": 0.1640907452185187
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.819672131147541,
                                "stdev": 0.10105596726178649
                            },
                            "true_positive_rate": {
                                "average": 0.16393442622950818,
                                "stdev": 0.1010559672617865
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 36,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.5737704918032787,
                            "precision": 0.9722222222222222,
                            "recall": 0.5833333333333334,
                            "f1": 0.7291666666666666,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.4098360655737705,
                            "true_positive_rate": 0.5737704918032787
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 6,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.11475409836065574,
                            "precision": 1.0,
                            "recall": 0.1,
                            "f1": 0.18181818181818182,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8852459016393442,
                            "true_positive_rate": 0.09836065573770492
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 27,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.45901639344262296,
                            "precision": 1.0,
                            "recall": 0.45,
                            "f1": 0.6206896551724138,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5409836065573771,
                            "true_positive_rate": 0.4426229508196721
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 7,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.13114754098360656,
                            "precision": 1.0,
                            "recall": 0.11666666666666667,
                            "f1": 0.208955223880597,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8688524590163934,
                            "true_positive_rate": 0.11475409836065574
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.31967213114754095,
                                "stdev": 0.20094509298575838
                            },
                            "precision": {
                                "average": 0.9930555555555556,
                                "stdev": 0.012028130608117209
                            },
                            "recall": {
                                "average": 0.3125,
                                "stdev": 0.20962102046842113
                            },
                            "f1": {
                                "average": 0.4351574318844648,
                                "stdev": 0.24300815343592744
                            },
                            "true_negative_rate": {
                                "average": 0.012295081967213115,
                                "stdev": 0.0070985688834790065
                            },
                            "false_positive_rate": {
                                "average": 0.004098360655737705,
                                "stdev": 0.007098568883479005
                            },
                            "false_negative_rate": {
                                "average": 0.6762295081967212,
                                "stdev": 0.20618461029680762
                            },
                            "true_positive_rate": {
                                "average": 0.3073770491803278,
                                "stdev": 0.20618461029680762
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 20,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.3114754098360656,
                            "precision": 0.95,
                            "recall": 0.31666666666666665,
                            "f1": 0.475,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.6721311475409836,
                            "true_positive_rate": 0.3114754098360656
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 15,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.26229508196721313,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7377049180327869,
                            "true_positive_rate": 0.2459016393442623
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 21,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.36065573770491804,
                            "precision": 1.0,
                            "recall": 0.35,
                            "f1": 0.5185185185185185,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.639344262295082,
                            "true_positive_rate": 0.3442622950819672
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 12,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.21311475409836064,
                            "precision": 1.0,
                            "recall": 0.2,
                            "f1": 0.3333333333333333,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7868852459016393,
                            "true_positive_rate": 0.19672131147540983
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.28688524590163933,
                                "stdev": 0.054985278135240734
                            },
                            "precision": {
                                "average": 0.9875,
                                "stdev": 0.021650635094610984
                            },
                            "recall": {
                                "average": 0.2791666666666667,
                                "stdev": 0.05818433351570391
                            },
                            "f1": {
                                "average": 0.43171296296296297,
                                "stdev": 0.07087530528454147
                            },
                            "true_negative_rate": {
                                "average": 0.012295081967213115,
                                "stdev": 0.0070985688834790065
                            },
                            "false_positive_rate": {
                                "average": 0.004098360655737705,
                                "stdev": 0.007098568883479005
                            },
                            "false_negative_rate": {
                                "average": 0.709016393442623,
                                "stdev": 0.05723049198265959
                            },
                            "true_positive_rate": {
                                "average": 0.2745901639344262,
                                "stdev": 0.057230491982659594
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 9,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.16393442622950818,
                            "precision": 1.0,
                            "recall": 0.15,
                            "f1": 0.2608695652173913,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8360655737704918,
                            "true_positive_rate": 0.14754098360655737
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 8,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.14754098360655737,
                            "precision": 1.0,
                            "recall": 0.13333333333333333,
                            "f1": 0.23529411764705882,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8524590163934426,
                            "true_positive_rate": 0.13114754098360656
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 8,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.14754098360655737,
                            "precision": 1.0,
                            "recall": 0.13333333333333333,
                            "f1": 0.23529411764705882,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8524590163934426,
                            "true_positive_rate": 0.13114754098360656
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 6,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.11475409836065574,
                            "precision": 1.0,
                            "recall": 0.1,
                            "f1": 0.18181818181818182,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8852459016393442,
                            "true_positive_rate": 0.09836065573770492
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.14344262295081966,
                                "stdev": 0.01786433993254374
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.12916666666666665,
                                "stdev": 0.018162078931419467
                            },
                            "f1": {
                                "average": 0.2283189955824227,
                                "stdev": 0.028806118794149883
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8565573770491803,
                                "stdev": 0.017864339932543733
                            },
                            "true_positive_rate": {
                                "average": 0.12704918032786885,
                                "stdev": 0.017864339932543744
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 61,
                        "prediction_error_num": 10,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.18032786885245902,
                            "precision": 1.0,
                            "recall": 0.16666666666666666,
                            "f1": 0.2857142857142857,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.819672131147541,
                            "true_positive_rate": 0.16393442622950818
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 61,
                        "prediction_error_num": 10,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.18032786885245902,
                            "precision": 1.0,
                            "recall": 0.16666666666666666,
                            "f1": 0.2857142857142857,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.819672131147541,
                            "true_positive_rate": 0.16393442622950818
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 61,
                        "prediction_error_num": 7,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.13114754098360656,
                            "precision": 1.0,
                            "recall": 0.11666666666666667,
                            "f1": 0.208955223880597,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8688524590163934,
                            "true_positive_rate": 0.11475409836065574
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 61,
                        "prediction_error_num": 7,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.13114754098360656,
                            "precision": 1.0,
                            "recall": 0.11666666666666667,
                            "f1": 0.208955223880597,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8688524590163934,
                            "true_positive_rate": 0.11475409836065574
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1557377049180328,
                                "stdev": 0.02459016393442623
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.14166666666666666,
                                "stdev": 0.024999999999999994
                            },
                            "f1": {
                                "average": 0.24733475479744133,
                                "stdev": 0.038379530916844345
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8442622950819672,
                                "stdev": 0.0245901639344262
                            },
                            "true_positive_rate": {
                                "average": 0.13934426229508196,
                                "stdev": 0.024590163934426222
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.3722056631892698,
                            "stdev": 0.2390617155490944
                        },
                        "precision": {
                            "average": 0.9465563461183663,
                            "stdev": 0.09293038015560447
                        },
                        "recall": {
                            "average": 0.3670454545454545,
                            "stdev": 0.2483052057581789
                        },
                        "f1": {
                            "average": 0.4732233880152789,
                            "stdev": 0.24494907138168118
                        },
                        "true_negative_rate": {
                            "average": 0.011177347242921014,
                            "stdev": 0.0055512907696540015
                        },
                        "false_positive_rate": {
                            "average": 0.005216095380029807,
                            "stdev": 0.0055512907696540015
                        },
                        "false_negative_rate": {
                            "average": 0.6225782414307004,
                            "stdev": 0.2442346286146022
                        },
                        "true_positive_rate": {
                            "average": 0.36102831594634877,
                            "stdev": 0.2442346286146022
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 28,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.3253012048192771,
                            "precision": 0.9642857142857143,
                            "recall": 0.32926829268292684,
                            "f1": 0.4909090909090909,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.6626506024096386,
                            "true_positive_rate": 0.3253012048192771
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 23,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.26506024096385544,
                            "precision": 0.9565217391304348,
                            "recall": 0.2682926829268293,
                            "f1": 0.41904761904761906,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.7228915662650602,
                            "true_positive_rate": 0.26506024096385544
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 28,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.3493975903614458,
                            "precision": 1.0,
                            "recall": 0.34146341463414637,
                            "f1": 0.509090909090909,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6506024096385542,
                            "true_positive_rate": 0.3373493975903614
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 4,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.060240963855421686,
                            "precision": 1.0,
                            "recall": 0.04878048780487805,
                            "f1": 0.09302325581395349,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9397590361445783,
                            "true_positive_rate": 0.04819277108433735
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.11378208896098857
                            },
                            "precision": {
                                "average": 0.9802018633540373,
                                "stdev": 0.019987524319652165
                            },
                            "recall": {
                                "average": 0.24695121951219517,
                                "stdev": 0.11772399131895016
                            },
                            "f1": {
                                "average": 0.3780177187153931,
                                "stdev": 0.16795084084096207
                            },
                            "true_negative_rate": {
                                "average": 0.006024096385542169,
                                "stdev": 0.006024096385542169
                            },
                            "false_positive_rate": {
                                "average": 0.006024096385542169,
                                "stdev": 0.006024096385542169
                            },
                            "false_negative_rate": {
                                "average": 0.7439759036144579,
                                "stdev": 0.11630562997775798
                            },
                            "true_positive_rate": {
                                "average": 0.24397590361445784,
                                "stdev": 0.11630562997775797
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 81,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.963855421686747,
                            "precision": 0.9876543209876543,
                            "recall": 0.975609756097561,
                            "f1": 0.9815950920245399,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.024096385542168676,
                            "true_positive_rate": 0.963855421686747
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 37,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.43373493975903615,
                            "precision": 0.972972972972973,
                            "recall": 0.43902439024390244,
                            "f1": 0.6050420168067226,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.5542168674698795,
                            "true_positive_rate": 0.43373493975903615
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 80,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.9518072289156626,
                            "precision": 0.9875,
                            "recall": 0.9634146341463414,
                            "f1": 0.9753086419753086,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.03614457831325301,
                            "true_positive_rate": 0.9518072289156626
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 60,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.7349397590361446,
                            "precision": 1.0,
                            "recall": 0.7317073170731707,
                            "f1": 0.8450704225352113,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.26506024096385544,
                            "true_positive_rate": 0.7228915662650602
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7710843373493976,
                                "stdev": 0.21501889515150296
                            },
                            "precision": {
                                "average": 0.9870318234901568,
                                "stdev": 0.009571201233828059
                            },
                            "recall": {
                                "average": 0.7774390243902438,
                                "stdev": 0.21821687605917553
                            },
                            "f1": {
                                "average": 0.8517540433354456,
                                "stdev": 0.15250896005698486
                            },
                            "true_negative_rate": {
                                "average": 0.0030120481927710845,
                                "stdev": 0.005217020504725535
                            },
                            "false_positive_rate": {
                                "average": 0.009036144578313254,
                                "stdev": 0.005217020504725534
                            },
                            "false_negative_rate": {
                                "average": 0.21987951807228917,
                                "stdev": 0.21558775707051078
                            },
                            "true_positive_rate": {
                                "average": 0.7680722891566265,
                                "stdev": 0.21558775707051078
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 70,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.8313253012048193,
                            "precision": 0.9857142857142858,
                            "recall": 0.8414634146341463,
                            "f1": 0.9078947368421053,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.1566265060240964,
                            "true_positive_rate": 0.8313253012048193
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 21,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.24096385542168675,
                            "precision": 0.9523809523809523,
                            "recall": 0.24390243902439024,
                            "f1": 0.3883495145631068,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.7469879518072289,
                            "true_positive_rate": 0.24096385542168675
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 66,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.7831325301204819,
                            "precision": 0.9848484848484849,
                            "recall": 0.7926829268292683,
                            "f1": 0.8783783783783784,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.20481927710843373,
                            "true_positive_rate": 0.7831325301204819
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 2,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.03614457831325301,
                            "precision": 1.0,
                            "recall": 0.024390243902439025,
                            "f1": 0.047619047619047616,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.963855421686747,
                            "true_positive_rate": 0.024096385542168676
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.47289156626506024,
                                "stdev": 0.34251371962989674
                            },
                            "precision": {
                                "average": 0.9807359307359307,
                                "stdev": 0.017441375244955242
                            },
                            "recall": {
                                "average": 0.475609756097561,
                                "stdev": 0.3505965002796354
                            },
                            "f1": {
                                "average": 0.5555604193506595,
                                "stdev": 0.35857873255027567
                            },
                            "true_negative_rate": {
                                "average": 0.0030120481927710845,
                                "stdev": 0.005217020504725535
                            },
                            "false_positive_rate": {
                                "average": 0.009036144578313254,
                                "stdev": 0.005217020504725534
                            },
                            "false_negative_rate": {
                                "average": 0.5180722891566265,
                                "stdev": 0.3463724460593988
                            },
                            "true_positive_rate": {
                                "average": 0.46987951807228917,
                                "stdev": 0.3463724460593988
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 20,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.25301204819277107,
                            "precision": 1.0,
                            "recall": 0.24390243902439024,
                            "f1": 0.39215686274509803,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7469879518072289,
                            "true_positive_rate": 0.24096385542168675
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 21,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.25301204819277107,
                            "precision": 1.0,
                            "recall": 0.25609756097560976,
                            "f1": 0.4077669902912621,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7349397590361446,
                            "true_positive_rate": 0.25301204819277107
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 22,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.27710843373493976,
                            "precision": 1.0,
                            "recall": 0.2682926829268293,
                            "f1": 0.4230769230769231,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7228915662650602,
                            "true_positive_rate": 0.26506024096385544
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 1,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.024096385542168676,
                            "precision": 1.0,
                            "recall": 0.012195121951219513,
                            "f1": 0.024096385542168676,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9759036144578314,
                            "true_positive_rate": 0.012048192771084338
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.20180722891566263,
                                "stdev": 0.10307191947149803
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1951219512195122,
                                "stdev": 0.1059643121767242
                            },
                            "f1": {
                                "average": 0.311774290413863,
                                "stdev": 0.16645029990468202
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7951807228915663,
                                "stdev": 0.10468763371676368
                            },
                            "true_positive_rate": {
                                "average": 0.1927710843373494,
                                "stdev": 0.10468763371676365
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 78,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.927710843373494,
                            "precision": 0.9871794871794872,
                            "recall": 0.9390243902439024,
                            "f1": 0.9625,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.060240963855421686,
                            "true_positive_rate": 0.927710843373494
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 63,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.7469879518072289,
                            "precision": 0.9841269841269841,
                            "recall": 0.7560975609756098,
                            "f1": 0.8551724137931035,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.24096385542168675,
                            "true_positive_rate": 0.7469879518072289
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 3,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.04819277108433735,
                            "precision": 1.0,
                            "recall": 0.036585365853658534,
                            "f1": 0.07058823529411765,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9518072289156626,
                            "true_positive_rate": 0.03614457831325301
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 0,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.012048192771084338,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9879518072289156,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4337349397590361,
                                "stdev": 0.40884046990258194
                            },
                            "precision": {
                                "average": 0.7428266178266179,
                                "stdev": 0.4289124981240055
                            },
                            "recall": {
                                "average": 0.4329268292682927,
                                "stdev": 0.41984707280736605
                            },
                            "f1": {
                                "average": 0.4720651622718053,
                                "stdev": 0.4391260458432855
                            },
                            "true_negative_rate": {
                                "average": 0.006024096385542169,
                                "stdev": 0.006024096385542169
                            },
                            "false_positive_rate": {
                                "average": 0.006024096385542169,
                                "stdev": 0.006024096385542169
                            },
                            "false_negative_rate": {
                                "average": 0.5602409638554217,
                                "stdev": 0.41478867433980743
                            },
                            "true_positive_rate": {
                                "average": 0.42771084337349397,
                                "stdev": 0.41478867433980743
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 4,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.060240963855421686,
                            "precision": 1.0,
                            "recall": 0.04878048780487805,
                            "f1": 0.09302325581395349,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9397590361445783,
                            "true_positive_rate": 0.04819277108433735
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 1,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.024096385542168676,
                            "precision": 1.0,
                            "recall": 0.012195121951219513,
                            "f1": 0.024096385542168676,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9759036144578314,
                            "true_positive_rate": 0.012048192771084338
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 2,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.03614457831325301,
                            "precision": 1.0,
                            "recall": 0.024390243902439025,
                            "f1": 0.047619047619047616,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.963855421686747,
                            "true_positive_rate": 0.024096385542168676
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 0,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.012048192771084338,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9879518072289156,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.03313253012048193,
                                "stdev": 0.017819517418974747
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.021341463414634144,
                                "stdev": 0.018036828607011026
                            },
                            "f1": {
                                "average": 0.04118467224379244,
                                "stdev": 0.03433957885318252
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9668674698795181,
                                "stdev": 0.01781951741897473
                            },
                            "true_positive_rate": {
                                "average": 0.02108433734939759,
                                "stdev": 0.017819517418974747
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 6,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.08433734939759036,
                            "precision": 1.0,
                            "recall": 0.07317073170731707,
                            "f1": 0.13636363636363635,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9156626506024096,
                            "true_positive_rate": 0.07228915662650602
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 1,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.024096385542168676,
                            "precision": 1.0,
                            "recall": 0.012195121951219513,
                            "f1": 0.024096385542168676,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9759036144578314,
                            "true_positive_rate": 0.012048192771084338
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 1,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.024096385542168676,
                            "precision": 1.0,
                            "recall": 0.012195121951219513,
                            "f1": 0.024096385542168676,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9759036144578314,
                            "true_positive_rate": 0.012048192771084338
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 0,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.012048192771084338,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9879518072289156,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.03614457831325301,
                                "stdev": 0.02825551662544234
                            },
                            "precision": {
                                "average": 0.75,
                                "stdev": 0.4330127018922193
                            },
                            "recall": {
                                "average": 0.024390243902439022,
                                "stdev": 0.028600096096484326
                            },
                            "f1": {
                                "average": 0.04613910186199342,
                                "stdev": 0.05301189968342508
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.963855421686747,
                                "stdev": 0.028255516625442366
                            },
                            "true_positive_rate": {
                                "average": 0.024096385542168672,
                                "stdev": 0.028255516625442345
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 20,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.2289156626506024,
                            "precision": 0.95,
                            "recall": 0.23170731707317074,
                            "f1": 0.37254901960784315,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.7590361445783133,
                            "true_positive_rate": 0.2289156626506024
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 2,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.03614457831325301,
                            "precision": 1.0,
                            "recall": 0.024390243902439025,
                            "f1": 0.047619047619047616,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.963855421686747,
                            "true_positive_rate": 0.024096385542168676
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 13,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.1686746987951807,
                            "precision": 1.0,
                            "recall": 0.15853658536585366,
                            "f1": 0.2736842105263158,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8313253012048193,
                            "true_positive_rate": 0.1566265060240964
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 0,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.012048192771084338,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9879518072289156,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1114457831325301,
                                "stdev": 0.09031123102749969
                            },
                            "precision": {
                                "average": 0.7375,
                                "stdev": 0.4262848226244983
                            },
                            "recall": {
                                "average": 0.10365853658536586,
                                "stdev": 0.09544192586889347
                            },
                            "f1": {
                                "average": 0.17346306943830164,
                                "stdev": 0.1546008141654619
                            },
                            "true_negative_rate": {
                                "average": 0.009036144578313254,
                                "stdev": 0.005217020504725534
                            },
                            "false_positive_rate": {
                                "average": 0.0030120481927710845,
                                "stdev": 0.005217020504725534
                            },
                            "false_negative_rate": {
                                "average": 0.8855421686746988,
                                "stdev": 0.09429202314758149
                            },
                            "true_positive_rate": {
                                "average": 0.10240963855421686,
                                "stdev": 0.09429202314758149
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 13,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.1686746987951807,
                            "precision": 1.0,
                            "recall": 0.15853658536585366,
                            "f1": 0.2736842105263158,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8313253012048193,
                            "true_positive_rate": 0.1566265060240964
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 12,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.1566265060240964,
                            "precision": 1.0,
                            "recall": 0.14634146341463414,
                            "f1": 0.2553191489361702,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8433734939759037,
                            "true_positive_rate": 0.14457831325301204
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 15,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.1927710843373494,
                            "precision": 1.0,
                            "recall": 0.18292682926829268,
                            "f1": 0.30927835051546393,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8072289156626506,
                            "true_positive_rate": 0.18072289156626506
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 6,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.08433734939759036,
                            "precision": 1.0,
                            "recall": 0.07317073170731707,
                            "f1": 0.13636363636363635,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9156626506024096,
                            "true_positive_rate": 0.07228915662650602
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1506024096385542,
                                "stdev": 0.040410867063249215
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.14024390243902438,
                                "stdev": 0.040903682515240056
                            },
                            "f1": {
                                "average": 0.24366133658539657,
                                "stdev": 0.0649146873566675
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8493975903614457,
                                "stdev": 0.04041086706324917
                            },
                            "true_positive_rate": {
                                "average": 0.13855421686746988,
                                "stdev": 0.040410867063249215
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 34,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.42168674698795183,
                            "precision": 1.0,
                            "recall": 0.4146341463414634,
                            "f1": 0.5862068965517241,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5783132530120482,
                            "true_positive_rate": 0.40963855421686746
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 36,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.4457831325301205,
                            "precision": 1.0,
                            "recall": 0.43902439024390244,
                            "f1": 0.6101694915254238,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5542168674698795,
                            "true_positive_rate": 0.43373493975903615
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 30,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.37349397590361444,
                            "precision": 1.0,
                            "recall": 0.36585365853658536,
                            "f1": 0.5357142857142857,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6265060240963856,
                            "true_positive_rate": 0.3614457831325301
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 36,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.4457831325301205,
                            "precision": 1.0,
                            "recall": 0.43902439024390244,
                            "f1": 0.6101694915254238,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5542168674698795,
                            "true_positive_rate": 0.43373493975903615
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4216867469879518,
                                "stdev": 0.029511924611845524
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4146341463414634,
                                "stdev": 0.029871826131502174
                            },
                            "f1": {
                                "average": 0.5855650413292144,
                                "stdev": 0.030398469338198193
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5783132530120482,
                                "stdev": 0.029511924611845524
                            },
                            "true_positive_rate": {
                                "average": 0.4096385542168675,
                                "stdev": 0.029511924611845524
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 83,
                        "prediction_error_num": 50,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.6144578313253012,
                            "precision": 1.0,
                            "recall": 0.6097560975609756,
                            "f1": 0.7575757575757576,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3855421686746988,
                            "true_positive_rate": 0.6024096385542169
                        }
                    },
                    "baseline_errordetection_prompt_2": {
                        "total_num": 83,
                        "prediction_error_num": 58,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.7108433734939759,
                            "precision": 1.0,
                            "recall": 0.7073170731707317,
                            "f1": 0.8285714285714286,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2891566265060241,
                            "true_positive_rate": 0.6987951807228916
                        }
                    },
                    "baseline_errordetection_prompt_3": {
                        "total_num": 83,
                        "prediction_error_num": 44,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.5421686746987951,
                            "precision": 1.0,
                            "recall": 0.5365853658536586,
                            "f1": 0.6984126984126984,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4578313253012048,
                            "true_positive_rate": 0.5301204819277109
                        }
                    },
                    "baseline_errordetection_prompt_4": {
                        "total_num": 83,
                        "prediction_error_num": 42,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.5180722891566265,
                            "precision": 1.0,
                            "recall": 0.5121951219512195,
                            "f1": 0.6774193548387096,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4819277108433735,
                            "true_positive_rate": 0.5060240963855421
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5963855421686748,
                                "stdev": 0.0749993951686068
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5914634146341463,
                                "stdev": 0.07591402193895566
                            },
                            "f1": {
                                "average": 0.7404948098496485,
                                "stdev": 0.05873386303597091
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.40361445783132527,
                                "stdev": 0.07499939516860682
                            },
                            "true_positive_rate": {
                                "average": 0.5843373493975904,
                                "stdev": 0.07499939516860685
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.31626506024096385,
                            "stdev": 0.22968180433983876
                        },
                        "precision": {
                            "average": 0.9025723850369766,
                            "stdev": 0.11930179490707035
                        },
                        "recall": {
                            "average": 0.31125277161862525,
                            "stdev": 0.2344533803752521
                        },
                        "f1": {
                            "average": 0.39997087867231945,
                            "stdev": 0.2561673756628831
                        },
                        "true_negative_rate": {
                            "average": 0.009036144578313253,
                            "stdev": 0.0036326668021417303
                        },
                        "false_positive_rate": {
                            "average": 0.0030120481927710845,
                            "stdev": 0.003632666802141731
                        },
                        "false_negative_rate": {
                            "average": 0.6804490690032858,
                            "stdev": 0.23162864085265872
                        },
                        "true_positive_rate": {
                            "average": 0.30750273822562985,
                            "stdev": 0.23162864085265872
                        }
                    }
                }
            }
        }
    }
}