{
    "math_word_problem_generation": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 63,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.44285714285714284,
                        "precision": 0.5079365079365079,
                        "recall": 0.4155844155844156,
                        "f1": 0.45714285714285713,
                        "true_negative_rate": 0.22857142857142856,
                        "false_positive_rate": 0.22142857142857142,
                        "false_negative_rate": 0.32142857142857145,
                        "true_positive_rate": 0.22857142857142856
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.44285714285714284,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5079365079365079,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.4155844155844156,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.45714285714285713,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.22857142857142856,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.22142857142857142,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.32142857142857145,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.22857142857142856,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 46,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.34285714285714286,
                        "precision": 0.391304347826087,
                        "recall": 0.23376623376623376,
                        "f1": 0.2926829268292683,
                        "true_negative_rate": 0.25,
                        "false_positive_rate": 0.2,
                        "false_negative_rate": 0.42142857142857143,
                        "true_positive_rate": 0.12857142857142856
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.34285714285714286,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.391304347826087,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.23376623376623376,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.2926829268292683,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.25,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.42142857142857143,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.12857142857142856,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 138,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.5507246376811594,
                        "recall": 0.987012987012987,
                        "f1": 0.7069767441860465,
                        "true_negative_rate": 0.007142857142857143,
                        "false_positive_rate": 0.44285714285714284,
                        "false_negative_rate": 0.007142857142857143,
                        "true_positive_rate": 0.5428571428571428
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.55,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5507246376811594,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.987012987012987,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7069767441860465,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.007142857142857143,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.44285714285714284,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.007142857142857143,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.5428571428571428,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 65,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6142857142857143,
                        "precision": 0.676923076923077,
                        "recall": 0.5714285714285714,
                        "f1": 0.6197183098591549,
                        "true_negative_rate": 0.3,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.2357142857142857,
                        "true_positive_rate": 0.3142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6142857142857143,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.676923076923077,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5714285714285714,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6197183098591549,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.3,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.15,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.2357142857142857,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.3142857142857143,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 93,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.6021505376344086,
                        "recall": 0.7272727272727273,
                        "f1": 0.6588235294117647,
                        "true_negative_rate": 0.18571428571428572,
                        "false_positive_rate": 0.2642857142857143,
                        "false_negative_rate": 0.15,
                        "true_positive_rate": 0.4
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5571428571428572,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6021505376344086,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7272727272727273,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6588235294117647,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.18571428571428572,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.2642857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.15,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.4,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 98,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6071428571428571,
                        "precision": 0.6224489795918368,
                        "recall": 0.7922077922077922,
                        "f1": 0.6971428571428572,
                        "true_negative_rate": 0.18571428571428572,
                        "false_positive_rate": 0.2642857142857143,
                        "false_negative_rate": 0.11428571428571428,
                        "true_positive_rate": 0.4357142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6071428571428571,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6224489795918368,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7922077922077922,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6971428571428572,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.18571428571428572,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.2642857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.11428571428571428,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.4357142857142857,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 104,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6928571428571428,
                        "precision": 0.6634615384615384,
                        "recall": 0.8961038961038961,
                        "f1": 0.7624309392265194,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.25,
                        "false_negative_rate": 0.05714285714285714,
                        "true_positive_rate": 0.4928571428571429
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6928571428571428,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6634615384615384,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8961038961038961,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7624309392265194,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.25,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.05714285714285714,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.4928571428571429,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 35,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.5428571428571428,
                        "precision": 0.6857142857142857,
                        "recall": 0.3116883116883117,
                        "f1": 0.42857142857142855,
                        "true_negative_rate": 0.37142857142857144,
                        "false_positive_rate": 0.07857142857142857,
                        "false_negative_rate": 0.37857142857142856,
                        "true_positive_rate": 0.17142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5428571428571428,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6857142857142857,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.3116883116883117,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.42857142857142855,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.37142857142857144,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.07857142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.37857142857142856,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.17142857142857143,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 61,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.6714285714285714,
                        "precision": 0.7540983606557377,
                        "recall": 0.5974025974025974,
                        "f1": 0.6666666666666666,
                        "true_negative_rate": 0.34285714285714286,
                        "false_positive_rate": 0.10714285714285714,
                        "false_negative_rate": 0.22142857142857142,
                        "true_positive_rate": 0.32857142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6714285714285714,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7540983606557377,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5974025974025974,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6666666666666666,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.34285714285714286,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.10714285714285714,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.22142857142857142,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.32857142857142857,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 56,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.7642857142857142,
                        "precision": 0.8928571428571429,
                        "recall": 0.6493506493506493,
                        "f1": 0.7518796992481203,
                        "true_negative_rate": 0.40714285714285714,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.19285714285714287,
                        "true_positive_rate": 0.35714285714285715
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7642857142857142,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8928571428571429,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6493506493506493,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7518796992481203,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.40714285714285714,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.04285714285714286,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.19285714285714287,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.35714285714285715,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 58,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": 0.7071428571428572,
                        "precision": 0.8103448275862069,
                        "recall": 0.6103896103896104,
                        "f1": 0.6962962962962963,
                        "true_negative_rate": 0.37142857142857144,
                        "false_positive_rate": 0.07857142857142857,
                        "false_negative_rate": 0.21428571428571427,
                        "true_positive_rate": 0.3357142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 77,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7071428571428572,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8103448275862069,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6103896103896104,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6962962962962963,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.37142857142857144,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.07857142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.21428571428571427,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.3357142857142857,
                            "stdev": 0.0
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 86,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.44375,
                        "precision": 0.686046511627907,
                        "recall": 0.49166666666666664,
                        "f1": 0.5728155339805825,
                        "true_negative_rate": 0.08125,
                        "false_positive_rate": 0.16875,
                        "false_negative_rate": 0.38125,
                        "true_positive_rate": 0.36875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.44375,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.686046511627907,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.49166666666666664,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5728155339805825,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.08125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.16875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.38125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.36875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 53,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.375,
                        "precision": 0.6981132075471698,
                        "recall": 0.30833333333333335,
                        "f1": 0.4277456647398844,
                        "true_negative_rate": 0.15,
                        "false_positive_rate": 0.1,
                        "false_negative_rate": 0.51875,
                        "true_positive_rate": 0.23125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.375,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6981132075471698,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.30833333333333335,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.4277456647398844,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.15,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.1,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.51875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.23125,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 153,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.78125,
                        "precision": 0.7777777777777778,
                        "recall": 0.9916666666666667,
                        "f1": 0.8717948717948718,
                        "true_negative_rate": 0.0375,
                        "false_positive_rate": 0.2125,
                        "false_negative_rate": 0.00625,
                        "true_positive_rate": 0.74375
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.78125,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7777777777777778,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.9916666666666667,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8717948717948718,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.0375,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.2125,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.00625,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.74375,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 93,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.625,
                        "precision": 0.8494623655913979,
                        "recall": 0.6583333333333333,
                        "f1": 0.7417840375586855,
                        "true_negative_rate": 0.1625,
                        "false_positive_rate": 0.0875,
                        "false_negative_rate": 0.25625,
                        "true_positive_rate": 0.49375
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.625,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8494623655913979,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6583333333333333,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7417840375586855,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.1625,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.25625,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.49375,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 122,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.73125,
                        "precision": 0.8278688524590164,
                        "recall": 0.8416666666666667,
                        "f1": 0.8347107438016529,
                        "true_negative_rate": 0.11875,
                        "false_positive_rate": 0.13125,
                        "false_negative_rate": 0.11875,
                        "true_positive_rate": 0.63125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.73125,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8278688524590164,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8416666666666667,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8347107438016529,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.11875,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.13125,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.11875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.63125,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 104,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.7,
                        "precision": 0.8461538461538461,
                        "recall": 0.7333333333333333,
                        "f1": 0.7857142857142857,
                        "true_negative_rate": 0.15,
                        "false_positive_rate": 0.1,
                        "false_negative_rate": 0.2,
                        "true_positive_rate": 0.55
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8461538461538461,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7333333333333333,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7857142857142857,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.15,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.1,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.55,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 113,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.71875,
                        "precision": 0.831858407079646,
                        "recall": 0.7833333333333333,
                        "f1": 0.8068669527896996,
                        "true_negative_rate": 0.13125,
                        "false_positive_rate": 0.11875,
                        "false_negative_rate": 0.1625,
                        "true_positive_rate": 0.5875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.71875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.831858407079646,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7833333333333333,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8068669527896996,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.13125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.11875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.1625,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.5875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 56,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.9285714285714286,
                        "recall": 0.43333333333333335,
                        "f1": 0.5909090909090909,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.425,
                        "true_positive_rate": 0.325
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.55,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9285714285714286,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.43333333333333335,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5909090909090909,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.225,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.025,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.425,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.325,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 112,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.85,
                        "precision": 0.9285714285714286,
                        "recall": 0.8666666666666667,
                        "f1": 0.896551724137931,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.05,
                        "false_negative_rate": 0.1,
                        "true_positive_rate": 0.65
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.85,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9285714285714286,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8666666666666667,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.896551724137931,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.05,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.1,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.65,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 110,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.825,
                        "precision": 0.9181818181818182,
                        "recall": 0.8416666666666667,
                        "f1": 0.8782608695652174,
                        "true_negative_rate": 0.19375,
                        "false_positive_rate": 0.05625,
                        "false_negative_rate": 0.11875,
                        "true_positive_rate": 0.63125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.825,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9181818181818182,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8416666666666667,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8782608695652174,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.19375,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.05625,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.11875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.63125,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 121,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": 0.83125,
                        "precision": 0.8842975206611571,
                        "recall": 0.8916666666666667,
                        "f1": 0.8879668049792531,
                        "true_negative_rate": 0.1625,
                        "false_positive_rate": 0.0875,
                        "false_negative_rate": 0.08125,
                        "true_positive_rate": 0.66875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 120,
                    "metrics": {
                        "accuracy": {
                            "average": 0.83125,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8842975206611571,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8916666666666667,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8879668049792531,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.1625,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.08125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.66875,
                            "stdev": 0.0
                        }
                    }
                }
            }
        }
    },
    "finegrained_fact_verification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 129,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.55,
                        "precision": 0.5581395348837209,
                        "recall": 0.9230769230769231,
                        "f1": 0.6956521739130435,
                        "true_negative_rate": 0.03571428571428571,
                        "false_positive_rate": 0.40714285714285714,
                        "false_negative_rate": 0.04285714285714286,
                        "true_positive_rate": 0.5142857142857142
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.55,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5581395348837209,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.9230769230769231,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6956521739130435,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.03571428571428571,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.40714285714285714,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.04285714285714286,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.5142857142857142,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 111,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5642857142857143,
                        "precision": 0.5765765765765766,
                        "recall": 0.8205128205128205,
                        "f1": 0.6772486772486772,
                        "true_negative_rate": 0.10714285714285714,
                        "false_positive_rate": 0.3357142857142857,
                        "false_negative_rate": 0.1,
                        "true_positive_rate": 0.45714285714285713
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5642857142857143,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5765765765765766,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8205128205128205,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6772486772486772,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.10714285714285714,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.3357142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.1,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.45714285714285713,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 138,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.5579710144927537,
                        "recall": 0.9871794871794872,
                        "f1": 0.7129629629629629,
                        "true_negative_rate": 0.007142857142857143,
                        "false_positive_rate": 0.4357142857142857,
                        "false_negative_rate": 0.007142857142857143,
                        "true_positive_rate": 0.55
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5571428571428572,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5579710144927537,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.9871794871794872,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7129629629629629,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.007142857142857143,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.4357142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.007142857142857143,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.55,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 125,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.6142857142857143,
                        "precision": 0.6,
                        "recall": 0.9615384615384616,
                        "f1": 0.7389162561576355,
                        "true_negative_rate": 0.08571428571428572,
                        "false_positive_rate": 0.35714285714285715,
                        "false_negative_rate": 0.02142857142857143,
                        "true_positive_rate": 0.5357142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6142857142857143,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.9615384615384616,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7389162561576355,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.08571428571428572,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.35714285714285715,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.02142857142857143,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.5357142857142857,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 89,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5071428571428571,
                        "precision": 0.550561797752809,
                        "recall": 0.6282051282051282,
                        "f1": 0.5868263473053892,
                        "true_negative_rate": 0.15714285714285714,
                        "false_positive_rate": 0.2857142857142857,
                        "false_negative_rate": 0.20714285714285716,
                        "true_positive_rate": 0.35
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5071428571428571,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.550561797752809,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6282051282051282,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5868263473053892,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.15714285714285714,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.2857142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.20714285714285716,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.35,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 65,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5642857142857143,
                        "precision": 0.6307692307692307,
                        "recall": 0.5256410256410257,
                        "f1": 0.5734265734265734,
                        "true_negative_rate": 0.2714285714285714,
                        "false_positive_rate": 0.17142857142857143,
                        "false_negative_rate": 0.2642857142857143,
                        "true_positive_rate": 0.29285714285714287
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5642857142857143,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6307692307692307,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5256410256410257,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5734265734265734,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2714285714285714,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.17142857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.2642857142857143,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.29285714285714287,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 63,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5642857142857143,
                        "precision": 0.6349206349206349,
                        "recall": 0.5128205128205128,
                        "f1": 0.5673758865248227,
                        "true_negative_rate": 0.2785714285714286,
                        "false_positive_rate": 0.16428571428571428,
                        "false_negative_rate": 0.2714285714285714,
                        "true_positive_rate": 0.2857142857142857
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5642857142857143,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6349206349206349,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5128205128205128,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5673758865248227,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2785714285714286,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.16428571428571428,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.2714285714285714,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.2857142857142857,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 45,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5214285714285715,
                        "precision": 0.6222222222222222,
                        "recall": 0.358974358974359,
                        "f1": 0.45528455284552843,
                        "true_negative_rate": 0.32142857142857145,
                        "false_positive_rate": 0.12142857142857143,
                        "false_negative_rate": 0.35714285714285715,
                        "true_positive_rate": 0.2
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5214285714285715,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6222222222222222,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.358974358974359,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.45528455284552843,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.32142857142857145,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.12142857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.35714285714285715,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 28,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.6071428571428571,
                        "recall": 0.21794871794871795,
                        "f1": 0.32075471698113206,
                        "true_negative_rate": 0.36428571428571427,
                        "false_positive_rate": 0.07857142857142857,
                        "false_negative_rate": 0.4357142857142857,
                        "true_positive_rate": 0.12142857142857143
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4857142857142857,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6071428571428571,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.21794871794871795,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.32075471698113206,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.36428571428571427,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.07857142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.4357142857142857,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.12142857142857143,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 6,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.8333333333333334,
                        "recall": 0.0641025641025641,
                        "f1": 0.11904761904761904,
                        "true_negative_rate": 0.4357142857142857,
                        "false_positive_rate": 0.007142857142857143,
                        "false_negative_rate": 0.5214285714285715,
                        "true_positive_rate": 0.03571428571428571
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4714285714285714,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8333333333333334,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.0641025641025641,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.11904761904761904,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.4357142857142857,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.007142857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5214285714285715,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.03571428571428571,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 17,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": 0.5357142857142857,
                        "precision": 0.8823529411764706,
                        "recall": 0.19230769230769232,
                        "f1": 0.3157894736842105,
                        "true_negative_rate": 0.42857142857142855,
                        "false_positive_rate": 0.014285714285714285,
                        "false_negative_rate": 0.45,
                        "true_positive_rate": 0.10714285714285714
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 78,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5357142857142857,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8823529411764706,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.19230769230769232,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.3157894736842105,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.42857142857142855,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.014285714285714285,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.45,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.10714285714285714,
                            "stdev": 0.0
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 137,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.69375,
                        "precision": 0.781021897810219,
                        "recall": 0.8492063492063492,
                        "f1": 0.8136882129277566,
                        "true_negative_rate": 0.025,
                        "false_positive_rate": 0.1875,
                        "false_negative_rate": 0.11875,
                        "true_positive_rate": 0.66875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.69375,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.781021897810219,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8492063492063492,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8136882129277566,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.025,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.1875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.11875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.66875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 125,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.75,
                        "precision": 0.848,
                        "recall": 0.8412698412698413,
                        "f1": 0.8446215139442231,
                        "true_negative_rate": 0.09375,
                        "false_positive_rate": 0.11875,
                        "false_negative_rate": 0.125,
                        "true_positive_rate": 0.6625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.75,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.848,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8412698412698413,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8446215139442231,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.09375,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.11875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.6625,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 159,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.78125,
                        "precision": 0.7861635220125787,
                        "recall": 0.9920634920634921,
                        "f1": 0.8771929824561403,
                        "true_negative_rate": 0.0,
                        "false_positive_rate": 0.2125,
                        "false_negative_rate": 0.00625,
                        "true_positive_rate": 0.78125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.78125,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7861635220125787,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.9920634920634921,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8771929824561403,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.2125,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.00625,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.78125,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 82,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.6375,
                        "precision": 0.9146341463414634,
                        "recall": 0.5952380952380952,
                        "f1": 0.7211538461538461,
                        "true_negative_rate": 0.16875,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.31875,
                        "true_positive_rate": 0.46875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6375,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9146341463414634,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5952380952380952,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7211538461538461,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.16875,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.04375,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.31875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.46875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 41,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.3,
                        "precision": 0.6829268292682927,
                        "recall": 0.2222222222222222,
                        "f1": 0.33532934131736525,
                        "true_negative_rate": 0.13125,
                        "false_positive_rate": 0.08125,
                        "false_negative_rate": 0.6125,
                        "true_positive_rate": 0.175
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6829268292682927,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.2222222222222222,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.33532934131736525,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.13125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.08125,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.175,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 48,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.4875,
                        "precision": 0.9583333333333334,
                        "recall": 0.36507936507936506,
                        "f1": 0.5287356321839081,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.5,
                        "true_positive_rate": 0.2875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9583333333333334,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.36507936507936506,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5287356321839081,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0125,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.2875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 28,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.3875,
                        "precision": 1.0,
                        "recall": 0.2222222222222222,
                        "f1": 0.36363636363636365,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.6125,
                        "true_positive_rate": 0.175
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.2222222222222222,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.36363636363636365,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.175,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 48,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.4625,
                        "precision": 0.9166666666666666,
                        "recall": 0.3492063492063492,
                        "f1": 0.5057471264367817,
                        "true_negative_rate": 0.1875,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.5125,
                        "true_positive_rate": 0.275
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4625,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9166666666666666,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.3492063492063492,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5057471264367817,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.1875,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.025,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.275,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 64,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.6125,
                        "precision": 1.0,
                        "recall": 0.5079365079365079,
                        "f1": 0.6736842105263158,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.3875,
                        "true_positive_rate": 0.4
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6125,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5079365079365079,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6736842105263158,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.3875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.4,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 33,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.38125,
                        "precision": 0.9090909090909091,
                        "recall": 0.23809523809523808,
                        "f1": 0.37735849056603776,
                        "true_negative_rate": 0.19375,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.6,
                        "true_positive_rate": 0.1875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.38125,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9090909090909091,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.23809523809523808,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.37735849056603776,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.19375,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.01875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.1875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 72,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": 0.5625,
                        "precision": 0.8888888888888888,
                        "recall": 0.5079365079365079,
                        "f1": 0.6464646464646465,
                        "true_negative_rate": 0.1625,
                        "false_positive_rate": 0.05,
                        "false_negative_rate": 0.3875,
                        "true_positive_rate": 0.4
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 126,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5625,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8888888888888888,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5079365079365079,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6464646464646465,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.1625,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.05,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.3875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.4,
                            "stdev": 0.0
                        }
                    }
                }
            }
        }
    },
    "answerability_classification": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 112,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5571428571428572,
                        "precision": 0.5892857142857143,
                        "recall": 0.8148148148148148,
                        "f1": 0.6839378238341969,
                        "true_negative_rate": 0.09285714285714286,
                        "false_positive_rate": 0.32857142857142857,
                        "false_negative_rate": 0.10714285714285714,
                        "true_positive_rate": 0.4714285714285714
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5571428571428572,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5892857142857143,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8148148148148148,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.6839378238341969,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.09285714285714286,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.32857142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.10714285714285714,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.4714285714285714,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 123,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5714285714285714,
                        "precision": 0.5853658536585366,
                        "recall": 0.8888888888888888,
                        "f1": 0.7058823529411765,
                        "true_negative_rate": 0.05714285714285714,
                        "false_positive_rate": 0.36428571428571427,
                        "false_negative_rate": 0.06428571428571428,
                        "true_positive_rate": 0.5142857142857142
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5714285714285714,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5853658536585366,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8888888888888888,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7058823529411765,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.05714285714285714,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.36428571428571427,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.06428571428571428,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.5142857142857142,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5785714285714286,
                        "precision": 0.5785714285714286,
                        "recall": 1.0,
                        "f1": 0.7330316742081447,
                        "true_negative_rate": 0.0,
                        "false_positive_rate": 0.42142857142857143,
                        "false_negative_rate": 0.0,
                        "true_positive_rate": 0.5785714285714286
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5785714285714286,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5785714285714286,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7330316742081447,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.42142857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.5785714285714286,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 43,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4785714285714286,
                        "precision": 0.6511627906976745,
                        "recall": 0.345679012345679,
                        "f1": 0.45161290322580644,
                        "true_negative_rate": 0.3142857142857143,
                        "false_positive_rate": 0.10714285714285714,
                        "false_negative_rate": 0.37857142857142856,
                        "true_positive_rate": 0.2
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4785714285714286,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6511627906976745,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.345679012345679,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.45161290322580644,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.3142857142857143,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.10714285714285714,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.37857142857142856,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 95,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5142857142857142,
                        "precision": 0.5789473684210527,
                        "recall": 0.6790123456790124,
                        "f1": 0.625,
                        "true_negative_rate": 0.1357142857142857,
                        "false_positive_rate": 0.2857142857142857,
                        "false_negative_rate": 0.18571428571428572,
                        "true_positive_rate": 0.39285714285714285
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5142857142857142,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.5789473684210527,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6790123456790124,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.625,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.1357142857142857,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.2857142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.18571428571428572,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.39285714285714285,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 19,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.6842105263157895,
                        "recall": 0.16049382716049382,
                        "f1": 0.26,
                        "true_negative_rate": 0.37857142857142856,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.4857142857142857,
                        "true_positive_rate": 0.09285714285714286
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4714285714285714,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6842105263157895,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.16049382716049382,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.26,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.37857142857142856,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.04285714285714286,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.4857142857142857,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.09285714285714286,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 21,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4857142857142857,
                        "precision": 0.7142857142857143,
                        "recall": 0.18518518518518517,
                        "f1": 0.29411764705882354,
                        "true_negative_rate": 0.37857142857142856,
                        "false_positive_rate": 0.04285714285714286,
                        "false_negative_rate": 0.4714285714285714,
                        "true_positive_rate": 0.10714285714285714
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4857142857142857,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7142857142857143,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.18518518518518517,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.29411764705882354,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.37857142857142856,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.04285714285714286,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.4714285714285714,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.10714285714285714,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 29,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.6206896551724138,
                        "recall": 0.2222222222222222,
                        "f1": 0.32727272727272727,
                        "true_negative_rate": 0.34285714285714286,
                        "false_positive_rate": 0.07857142857142857,
                        "false_negative_rate": 0.45,
                        "true_positive_rate": 0.12857142857142856
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4714285714285714,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6206896551724138,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.2222222222222222,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.32727272727272727,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.34285714285714286,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.07857142857142857,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.45,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.12857142857142856,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 31,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5,
                        "precision": 0.6774193548387096,
                        "recall": 0.25925925925925924,
                        "f1": 0.375,
                        "true_negative_rate": 0.35,
                        "false_positive_rate": 0.07142857142857142,
                        "false_negative_rate": 0.42857142857142855,
                        "true_positive_rate": 0.15
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.6774193548387096,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.25925925925925924,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.375,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.35,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.07142857142857142,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.42857142857142855,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.15,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 11,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.4714285714285714,
                        "precision": 0.8181818181818182,
                        "recall": 0.1111111111111111,
                        "f1": 0.1956521739130435,
                        "true_negative_rate": 0.40714285714285714,
                        "false_positive_rate": 0.014285714285714285,
                        "false_negative_rate": 0.5142857142857142,
                        "true_positive_rate": 0.06428571428571428
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4714285714285714,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8181818181818182,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.1111111111111111,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.1956521739130435,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.40714285714285714,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.014285714285714285,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5142857142857142,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.06428571428571428,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 140,
                    "prediction_error_num": 19,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": 0.5142857142857142,
                        "precision": 0.8421052631578947,
                        "recall": 0.19753086419753085,
                        "f1": 0.32,
                        "true_negative_rate": 0.4,
                        "false_positive_rate": 0.02142857142857143,
                        "false_negative_rate": 0.4642857142857143,
                        "true_positive_rate": 0.11428571428571428
                    }
                },
                "average": {
                    "total_num": 140,
                    "gold_error_num": 81,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5142857142857142,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8421052631578947,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.19753086419753085,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.32,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.4,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.02142857142857143,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.4642857142857143,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.11428571428571428,
                            "stdev": 0.0
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 58,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.3875,
                        "precision": 0.7241379310344828,
                        "recall": 0.3387096774193548,
                        "f1": 0.46153846153846156,
                        "true_negative_rate": 0.125,
                        "false_positive_rate": 0.1,
                        "false_negative_rate": 0.5125,
                        "true_positive_rate": 0.2625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7241379310344828,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.3387096774193548,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.46153846153846156,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.1,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.2625,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 119,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.66875,
                        "precision": 0.7983193277310925,
                        "recall": 0.7661290322580645,
                        "f1": 0.7818930041152263,
                        "true_negative_rate": 0.075,
                        "false_positive_rate": 0.15,
                        "false_negative_rate": 0.18125,
                        "true_positive_rate": 0.59375
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.66875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7983193277310925,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7661290322580645,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7818930041152263,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.075,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.15,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.18125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.59375,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 157,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.76875,
                        "precision": 0.7770700636942676,
                        "recall": 0.9838709677419355,
                        "f1": 0.8683274021352313,
                        "true_negative_rate": 0.00625,
                        "false_positive_rate": 0.21875,
                        "false_negative_rate": 0.0125,
                        "true_positive_rate": 0.7625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.76875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7770700636942676,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.9838709677419355,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.8683274021352313,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.00625,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.21875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.0125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.7625,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 18,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.26875,
                        "precision": 0.7777777777777778,
                        "recall": 0.11290322580645161,
                        "f1": 0.19718309859154928,
                        "true_negative_rate": 0.2,
                        "false_positive_rate": 0.025,
                        "false_negative_rate": 0.6875,
                        "true_positive_rate": 0.0875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.26875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7777777777777778,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.11290322580645161,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.19718309859154928,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.025,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.0875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 49,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.3875,
                        "precision": 0.7755102040816326,
                        "recall": 0.3064516129032258,
                        "f1": 0.4393063583815029,
                        "true_negative_rate": 0.15625,
                        "false_positive_rate": 0.06875,
                        "false_negative_rate": 0.5375,
                        "true_positive_rate": 0.2375
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3875,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.7755102040816326,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.3064516129032258,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.4393063583815029,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.15625,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.06875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5375,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.2375,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 20,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.3375,
                        "precision": 0.95,
                        "recall": 0.1532258064516129,
                        "f1": 0.2638888888888889,
                        "true_negative_rate": 0.21875,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.65625,
                        "true_positive_rate": 0.11875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.3375,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.95,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.1532258064516129,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.2638888888888889,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.21875,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.00625,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.65625,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.11875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 4,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.2375,
                        "precision": 0.75,
                        "recall": 0.024193548387096774,
                        "f1": 0.046875,
                        "true_negative_rate": 0.21875,
                        "false_positive_rate": 0.00625,
                        "false_negative_rate": 0.75625,
                        "true_positive_rate": 0.01875
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.2375,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.75,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.024193548387096774,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.046875,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.21875,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.00625,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.75625,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.01875,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 12,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.275,
                        "precision": 0.8333333333333334,
                        "recall": 0.08064516129032258,
                        "f1": 0.14705882352941177,
                        "true_negative_rate": 0.2125,
                        "false_positive_rate": 0.0125,
                        "false_negative_rate": 0.7125,
                        "true_positive_rate": 0.0625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.275,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.8333333333333334,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.08064516129032258,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.14705882352941177,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.2125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0125,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.7125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.0625,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 17,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.33125,
                        "precision": 1.0,
                        "recall": 0.13709677419354838,
                        "f1": 0.24113475177304963,
                        "true_negative_rate": 0.225,
                        "false_positive_rate": 0.0,
                        "false_negative_rate": 0.66875,
                        "true_positive_rate": 0.10625
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.33125,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.13709677419354838,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.24113475177304963,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.225,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.66875,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.10625,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 56,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.5375,
                        "precision": 0.9464285714285714,
                        "recall": 0.4274193548387097,
                        "f1": 0.5888888888888889,
                        "true_negative_rate": 0.20625,
                        "false_positive_rate": 0.01875,
                        "false_negative_rate": 0.44375,
                        "true_positive_rate": 0.33125
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5375,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9464285714285714,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.4274193548387097,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.5888888888888889,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.20625,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.01875,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.44375,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.33125,
                            "stdev": 0.0
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "prompt=cot_instruction_prompt": {
                    "total_num": 160,
                    "prediction_error_num": 86,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": 0.675,
                        "precision": 0.9186046511627907,
                        "recall": 0.6370967741935484,
                        "f1": 0.7523809523809524,
                        "true_negative_rate": 0.18125,
                        "false_positive_rate": 0.04375,
                        "false_negative_rate": 0.28125,
                        "true_positive_rate": 0.49375
                    }
                },
                "average": {
                    "total_num": 160,
                    "gold_error_num": 124,
                    "metrics": {
                        "accuracy": {
                            "average": 0.675,
                            "stdev": 0.0
                        },
                        "precision": {
                            "average": 0.9186046511627907,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6370967741935484,
                            "stdev": 0.0
                        },
                        "f1": {
                            "average": 0.7523809523809524,
                            "stdev": 0.0
                        },
                        "true_negative_rate": {
                            "average": 0.18125,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.04375,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.28125,
                            "stdev": 0.0
                        },
                        "true_positive_rate": {
                            "average": 0.49375,
                            "stdev": 0.0
                        }
                    }
                }
            }
        }
    },
    "average": {
        "initial_model=gpt-4-0613": {
            "baseline_model=google/gemma-7b-it": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5166666666666667,
                            "stdev": 0.052272615238333715
                        },
                        "precision": {
                            "average": 0.5517872523686478,
                            "stdev": 0.0335130518184077
                        },
                        "recall": {
                            "average": 0.7178253844920511,
                            "stdev": 0.21823897220861488
                        },
                        "f1": {
                            "average": 0.6122442849633658,
                            "stdev": 0.10977749066000975
                        },
                        "true_negative_rate": {
                            "average": 0.11904761904761905,
                            "stdev": 0.08088232262601441
                        },
                        "false_positive_rate": {
                            "average": 0.319047619047619,
                            "stdev": 0.07611603506259897
                        },
                        "false_negative_rate": {
                            "average": 0.15714285714285717,
                            "stdev": 0.1190952285752362
                        },
                        "true_positive_rate": {
                            "average": 0.4047619047619047,
                            "stdev": 0.12580804584402766
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4928571428571429,
                            "stdev": 0.10610609502455803
                        },
                        "precision": {
                            "average": 0.5177489260204,
                            "stdev": 0.08948179093707345
                        },
                        "recall": {
                            "average": 0.6477226477226478,
                            "stdev": 0.29403940345173996
                        },
                        "f1": {
                            "average": 0.5586046523397074,
                            "stdev": 0.1883980624807534
                        },
                        "true_negative_rate": {
                            "average": 0.1380952380952381,
                            "stdev": 0.08171905857966912
                        },
                        "false_positive_rate": {
                            "average": 0.3,
                            "stdev": 0.07166627115853527
                        },
                        "false_negative_rate": {
                            "average": 0.19523809523809524,
                            "stdev": 0.16060401860990525
                        },
                        "true_positive_rate": {
                            "average": 0.36666666666666664,
                            "stdev": 0.16996731711975951
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.561904761904762,
                            "stdev": 0.012140522651411401
                        },
                        "precision": {
                            "average": 0.5624223602484473,
                            "stdev": 0.011796095399160503
                        },
                        "recall": {
                            "average": 0.9913974913974913,
                            "stdev": 0.006083271941345209
                        },
                        "f1": {
                            "average": 0.7176571271190514,
                            "stdev": 0.011142747340978898
                        },
                        "true_negative_rate": {
                            "average": 0.0047619047619047615,
                            "stdev": 0.003367175148507369
                        },
                        "false_positive_rate": {
                            "average": 0.43333333333333335,
                            "stdev": 0.008908708063747472
                        },
                        "false_negative_rate": {
                            "average": 0.0047619047619047615,
                            "stdev": 0.003367175148507369
                        },
                        "true_positive_rate": {
                            "average": 0.5571428571428573,
                            "stdev": 0.015430334996209221
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5690476190476191,
                            "stdev": 0.06397632782164002
                        },
                        "precision": {
                            "average": 0.6426952892069171,
                            "stdev": 0.03196940075499683
                        },
                        "recall": {
                            "average": 0.6262153484375707,
                            "stdev": 0.2543906556083704
                        },
                        "f1": {
                            "average": 0.6034158230808656,
                            "stdev": 0.11785621878044167
                        },
                        "true_negative_rate": {
                            "average": 0.2333333333333333,
                            "stdev": 0.10454523047666738
                        },
                        "false_positive_rate": {
                            "average": 0.20476190476190473,
                            "stdev": 0.10916088939378717
                        },
                        "false_negative_rate": {
                            "average": 0.2119047619047619,
                            "stdev": 0.1467717619754518
                        },
                        "true_positive_rate": {
                            "average": 0.35000000000000003,
                            "stdev": 0.13936200596820852
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5261904761904762,
                            "stdev": 0.022080044036894557
                        },
                        "precision": {
                            "average": 0.5772199012694235,
                            "stdev": 0.0210964077149453
                        },
                        "recall": {
                            "average": 0.6781634003856226,
                            "stdev": 0.040448632670240056
                        },
                        "f1": {
                            "average": 0.623549958905718,
                            "stdev": 0.029410604914961988
                        },
                        "true_negative_rate": {
                            "average": 0.1595238095238095,
                            "stdev": 0.020481726826291975
                        },
                        "false_positive_rate": {
                            "average": 0.2785714285714286,
                            "stdev": 0.010101525445522098
                        },
                        "false_negative_rate": {
                            "average": 0.18095238095238098,
                            "stdev": 0.02357022603955159
                        },
                        "true_positive_rate": {
                            "average": 0.38095238095238093,
                            "stdev": 0.022080044036894547
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5476190476190476,
                            "stdev": 0.056644653587477695
                        },
                        "precision": {
                            "average": 0.6458095788922856,
                            "stdev": 0.0273651995203246
                        },
                        "recall": {
                            "average": 0.4927808816697706,
                            "stdev": 0.2589407592549611
                        },
                        "f1": {
                            "average": 0.5101898101898102,
                            "stdev": 0.18397941108767568
                        },
                        "true_negative_rate": {
                            "average": 0.2785714285714285,
                            "stdev": 0.07889543583705186
                        },
                        "false_positive_rate": {
                            "average": 0.15952380952380954,
                            "stdev": 0.09078893316636961
                        },
                        "false_negative_rate": {
                            "average": 0.2880952380952381,
                            "stdev": 0.15256685089143615
                        },
                        "true_positive_rate": {
                            "average": 0.27380952380952384,
                            "stdev": 0.14061736247841336
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.580952380952381,
                            "stdev": 0.08538295750971955
                        },
                        "precision": {
                            "average": 0.6708892958892959,
                            "stdev": 0.03282359558773275
                        },
                        "recall": {
                            "average": 0.531369864703198,
                            "stdev": 0.29052758013084995
                        },
                        "f1": {
                            "average": 0.5413081576033886,
                            "stdev": 0.1920746031686255
                        },
                        "true_negative_rate": {
                            "average": 0.28571428571428575,
                            "stdev": 0.07307623453915622
                        },
                        "false_positive_rate": {
                            "average": 0.15238095238095237,
                            "stdev": 0.08498365855987976
                        },
                        "false_negative_rate": {
                            "average": 0.26666666666666666,
                            "stdev": 0.16916494922220582
                        },
                        "true_positive_rate": {
                            "average": 0.29523809523809524,
                            "stdev": 0.15761113488986064
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5119047619047619,
                            "stdev": 0.02992810735708698
                        },
                        "precision": {
                            "average": 0.6428753877029739,
                            "stdev": 0.03029813610942891
                        },
                        "recall": {
                            "average": 0.2976282976282976,
                            "stdev": 0.05670714077959156
                        },
                        "f1": {
                            "average": 0.40370956956322807,
                            "stdev": 0.05513825590657269
                        },
                        "true_negative_rate": {
                            "average": 0.3452380952380952,
                            "stdev": 0.020481726826291964
                        },
                        "false_positive_rate": {
                            "average": 0.09285714285714286,
                            "stdev": 0.020203050891044214
                        },
                        "false_negative_rate": {
                            "average": 0.3952380952380952,
                            "stdev": 0.039698409525078734
                        },
                        "true_positive_rate": {
                            "average": 0.16666666666666666,
                            "stdev": 0.029354352395090374
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5523809523809523,
                            "stdev": 0.08438116736509212
                        },
                        "precision": {
                            "average": 0.6795535242124348,
                            "stdev": 0.06001330970358643
                        },
                        "recall": {
                            "average": 0.3582035248701915,
                            "stdev": 0.16997801314898514
                        },
                        "f1": {
                            "average": 0.45414046121593293,
                            "stdev": 0.15190167584090897
                        },
                        "true_negative_rate": {
                            "average": 0.3523809523809523,
                            "stdev": 0.008908708063747472
                        },
                        "false_positive_rate": {
                            "average": 0.0857142857142857,
                            "stdev": 0.01543033499620919
                        },
                        "false_negative_rate": {
                            "average": 0.3619047619047619,
                            "stdev": 0.09937446073671051
                        },
                        "true_positive_rate": {
                            "average": 0.19999999999999998,
                            "stdev": 0.09165893597581028
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5690476190476191,
                            "stdev": 0.1380541810888021
                        },
                        "precision": {
                            "average": 0.8481240981240982,
                            "stdev": 0.03223017293534207
                        },
                        "recall": {
                            "average": 0.2748547748547749,
                            "stdev": 0.26550307077167806
                        },
                        "f1": {
                            "average": 0.3555264974029276,
                            "stdev": 0.2820034986972496
                        },
                        "true_negative_rate": {
                            "average": 0.4166666666666667,
                            "stdev": 0.01346870059402948
                        },
                        "false_positive_rate": {
                            "average": 0.021428571428571432,
                            "stdev": 0.015430334996209192
                        },
                        "false_negative_rate": {
                            "average": 0.40952380952380957,
                            "stdev": 0.15323421818765795
                        },
                        "true_positive_rate": {
                            "average": 0.15238095238095237,
                            "stdev": 0.14525760993018708
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "average": {
                    "total_num": 420,
                    "gold_error_num": 236,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5857142857142857,
                            "stdev": 0.08630747123996127
                        },
                        "precision": {
                            "average": 0.8449343439735241,
                            "stdev": 0.02946517582280368
                        },
                        "recall": {
                            "average": 0.3334093889649445,
                            "stdev": 0.19586620037922214
                        },
                        "f1": {
                            "average": 0.4440285899935023,
                            "stdev": 0.1783884877926423
                        },
                        "true_negative_rate": {
                            "average": 0.4000000000000001,
                            "stdev": 0.02332847374079216
                        },
                        "false_positive_rate": {
                            "average": 0.0380952380952381,
                            "stdev": 0.028769157079987076
                        },
                        "false_negative_rate": {
                            "average": 0.3761904761904762,
                            "stdev": 0.11463241064010214
                        },
                        "true_positive_rate": {
                            "average": 0.18571428571428572,
                            "stdev": 0.10610609502455803
                        }
                    }
                }
            }
        },
        "initial_model=meta-llama/Llama-2-70b-chat-hf": {
            "baseline_model=google/gemma-7b-it": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5083333333333333,
                            "stdev": 0.13310527370802738
                        },
                        "precision": {
                            "average": 0.7304021134908695,
                            "stdev": 0.03902572639943218
                        },
                        "recall": {
                            "average": 0.5598608977641235,
                            "stdev": 0.21391517453207098
                        },
                        "f1": {
                            "average": 0.6160140694822669,
                            "stdev": 0.14697380027973986
                        },
                        "true_negative_rate": {
                            "average": 0.07708333333333334,
                            "stdev": 0.04093100563414271
                        },
                        "false_positive_rate": {
                            "average": 0.15208333333333335,
                            "stdev": 0.037615562677641226
                        },
                        "false_negative_rate": {
                            "average": 0.33749999999999997,
                            "stdev": 0.16369751067135993
                        },
                        "true_positive_rate": {
                            "average": 0.43333333333333335,
                            "stdev": 0.17202329461119176
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5979166666666667,
                            "stdev": 0.16107818150064757
                        },
                        "precision": {
                            "average": 0.7814775117594207,
                            "stdev": 0.062339114055661685
                        },
                        "recall": {
                            "average": 0.6385774022870797,
                            "stdev": 0.2355240880873705
                        },
                        "f1": {
                            "average": 0.6847533942664447,
                            "stdev": 0.18352737536698221
                        },
                        "true_negative_rate": {
                            "average": 0.10625,
                            "stdev": 0.0318688719599549
                        },
                        "false_positive_rate": {
                            "average": 0.12291666666666667,
                            "stdev": 0.020623947784607632
                        },
                        "false_negative_rate": {
                            "average": 0.275,
                            "stdev": 0.17388034679054445
                        },
                        "true_positive_rate": {
                            "average": 0.49583333333333335,
                            "stdev": 0.18918226807910818
                        }
                    }
                }
            },
            "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.7770833333333332,
                            "stdev": 0.005892556509887875
                        },
                        "precision": {
                            "average": 0.7803371211615412,
                            "stdev": 0.004130006050336652
                        },
                        "recall": {
                            "average": 0.9892003754906981,
                            "stdev": 0.0037719409515808984
                        },
                        "f1": {
                            "average": 0.8724384187954145,
                            "stdev": 0.0036478526252260476
                        },
                        "true_negative_rate": {
                            "average": 0.014583333333333332,
                            "stdev": 0.01640418307085794
                        },
                        "false_positive_rate": {
                            "average": 0.21458333333333335,
                            "stdev": 0.0029462782549439506
                        },
                        "false_negative_rate": {
                            "average": 0.008333333333333333,
                            "stdev": 0.0029462782549439484
                        },
                        "true_positive_rate": {
                            "average": 0.7624999999999998,
                            "stdev": 0.015309310892394854
                        }
                    }
                }
            },
            "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5104166666666666,
                            "stdev": 0.1709603186577387
                        },
                        "precision": {
                            "average": 0.8472914299035463,
                            "stdev": 0.05589246297090173
                        },
                        "recall": {
                            "average": 0.4554915514592934,
                            "stdev": 0.2436121547635187
                        },
                        "f1": {
                            "average": 0.553373660768027,
                            "stdev": 0.2520055404642875
                        },
                        "true_negative_rate": {
                            "average": 0.17708333333333334,
                            "stdev": 0.01640418307085794
                        },
                        "false_positive_rate": {
                            "average": 0.05208333333333332,
                            "stdev": 0.026187093937451115
                        },
                        "false_negative_rate": {
                            "average": 0.42083333333333334,
                            "stdev": 0.190280312229674
                        },
                        "true_positive_rate": {
                            "average": 0.35000000000000003,
                            "stdev": 0.18589591621836848
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.47291666666666665,
                            "stdev": 0.1861292487016004
                        },
                        "precision": {
                            "average": 0.762101961936314,
                            "stdev": 0.05992708480437259
                        },
                        "recall": {
                            "average": 0.4567801672640382,
                            "stdev": 0.27431959539975803
                        },
                        "f1": {
                            "average": 0.5364488145001737,
                            "stdev": 0.2151324225872202
                        },
                        "true_negative_rate": {
                            "average": 0.13541666666666666,
                            "stdev": 0.015590239111558091
                        },
                        "false_positive_rate": {
                            "average": 0.09375,
                            "stdev": 0.027003086243366083
                        },
                        "false_negative_rate": {
                            "average": 0.42291666666666666,
                            "stdev": 0.21724681943714516
                        },
                        "true_positive_rate": {
                            "average": 0.34791666666666665,
                            "stdev": 0.2019651716399527
                        }
                    }
                }
            },
            "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5083333333333333,
                            "stdev": 0.14872140247978952
                        },
                        "precision": {
                            "average": 0.9181623931623931,
                            "stdev": 0.051031259980673975
                        },
                        "recall": {
                            "average": 0.4172128349547704,
                            "stdev": 0.23967979986799587
                        },
                        "f1": {
                            "average": 0.5261129355956942,
                            "stdev": 0.21304239812150239
                        },
                        "true_negative_rate": {
                            "average": 0.18958333333333333,
                            "stdev": 0.02901747557746692
                        },
                        "false_positive_rate": {
                            "average": 0.03958333333333334,
                            "stdev": 0.042797163717029456
                        },
                        "false_negative_rate": {
                            "average": 0.45208333333333334,
                            "stdev": 0.1893198718806055
                        },
                        "true_positive_rate": {
                            "average": 0.31875000000000003,
                            "stdev": 0.1774383695822299
                        }
                    }
                }
            },
            "baseline_model=gpt-3.5-turbo-0125": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4479166666666667,
                            "stdev": 0.20106055610741314
                        },
                        "precision": {
                            "average": 0.8606194690265486,
                            "stdev": 0.10406856397982751
                        },
                        "recall": {
                            "average": 0.34324970131421745,
                            "stdev": 0.3215162425068414
                        },
                        "f1": {
                            "average": 0.4057927721420211,
                            "stdev": 0.31169409593365427
                        },
                        "true_negative_rate": {
                            "average": 0.1875,
                            "stdev": 0.039856513478560396
                        },
                        "false_positive_rate": {
                            "average": 0.041666666666666664,
                            "stdev": 0.054565836890453315
                        },
                        "false_negative_rate": {
                            "average": 0.5104166666666666,
                            "stdev": 0.2529170098844459
                        },
                        "true_positive_rate": {
                            "average": 0.2604166666666667,
                            "stdev": 0.2399182441768214
                        }
                    }
                }
            },
            "baseline_model=models/gemini-1.0-pro-001": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.4291666666666667,
                            "stdev": 0.11471583248280172
                        },
                        "precision": {
                            "average": 0.8928571428571429,
                            "stdev": 0.04236935814298138
                        },
                        "recall": {
                            "average": 0.2877282812766684,
                            "stdev": 0.15040368191866169
                        },
                        "f1": {
                            "average": 0.41457168029176145,
                            "stdev": 0.1923286881843351
                        },
                        "true_negative_rate": {
                            "average": 0.20833333333333334,
                            "stdev": 0.015590239111558091
                        },
                        "false_positive_rate": {
                            "average": 0.020833333333333332,
                            "stdev": 0.005892556509887897
                        },
                        "false_negative_rate": {
                            "average": 0.5499999999999999,
                            "stdev": 0.12032940898494712
                        },
                        "true_positive_rate": {
                            "average": 0.22083333333333335,
                            "stdev": 0.11380416903123053
                        }
                    }
                }
            },
            "baseline_model=claude-3-opus-20240229": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5979166666666667,
                            "stdev": 0.21202970834819876
                        },
                        "precision": {
                            "average": 0.9761904761904763,
                            "stdev": 0.033671751485073675
                        },
                        "recall": {
                            "average": 0.503899982932241,
                            "stdev": 0.29785933721740976
                        },
                        "f1": {
                            "average": 0.603790228812432,
                            "stdev": 0.27209891390920377
                        },
                        "true_negative_rate": {
                            "average": 0.2125,
                            "stdev": 0.010206207261596573
                        },
                        "false_positive_rate": {
                            "average": 0.016666666666666666,
                            "stdev": 0.023570226039551587
                        },
                        "false_negative_rate": {
                            "average": 0.3854166666666667,
                            "stdev": 0.23219588832611904
                        },
                        "true_positive_rate": {
                            "average": 0.3854166666666667,
                            "stdev": 0.22222439235051483
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0613": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.5812499999999999,
                            "stdev": 0.18378259348117454
                        },
                        "precision": {
                            "average": 0.9245670995670995,
                            "stdev": 0.015897675322453855
                        },
                        "recall": {
                            "average": 0.5023937532002049,
                            "stdev": 0.2520456144618667
                        },
                        "f1": {
                            "average": 0.6148360830067148,
                            "stdev": 0.20531397254562836
                        },
                        "true_negative_rate": {
                            "average": 0.19791666666666666,
                            "stdev": 0.005892556509887888
                        },
                        "false_positive_rate": {
                            "average": 0.03125,
                            "stdev": 0.01767766952966369
                        },
                        "false_negative_rate": {
                            "average": 0.3875,
                            "stdev": 0.2004552111237487
                        },
                        "true_positive_rate": {
                            "average": 0.3833333333333333,
                            "stdev": 0.18486575459565846
                        }
                    }
                }
            },
            "baseline_model=gpt-4-0125-preview": {
                "average": {
                    "total_num": 480,
                    "gold_error_num": 370,
                    "metrics": {
                        "accuracy": {
                            "average": 0.6895833333333333,
                            "stdev": 0.11020025962865163
                        },
                        "precision": {
                            "average": 0.8972636869042788,
                            "stdev": 0.015206308606692703
                        },
                        "recall": {
                            "average": 0.678899982932241,
                            "stdev": 0.1594215373894135
                        },
                        "f1": {
                            "average": 0.7622708012749507,
                            "stdev": 0.09884054490327043
                        },
                        "true_negative_rate": {
                            "average": 0.16874999999999998,
                            "stdev": 0.008838834764831839
                        },
                        "false_positive_rate": {
                            "average": 0.060416666666666674,
                            "stdev": 0.019320038532282712
                        },
                        "false_negative_rate": {
                            "average": 0.25,
                            "stdev": 0.1269637415432716
                        },
                        "true_positive_rate": {
                            "average": 0.5208333333333334,
                            "stdev": 0.1113755533718638
                        }
                    }
                }
            }
        }
    }
}