{
    "math_word_problem_generation": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 12,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.42857142857142855,
                            "precision": 1.0,
                            "recall": 0.42857142857142855,
                            "f1": 0.6,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5714285714285714,
                            "true_positive_rate": 0.42857142857142855
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.42857142857142855,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.42857142857142855,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5714285714285714,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.42857142857142855,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 8,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.2857142857142857,
                            "precision": 1.0,
                            "recall": 0.2857142857142857,
                            "f1": 0.4444444444444444,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7142857142857143,
                            "true_positive_rate": 0.2857142857142857
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2857142857142857,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2857142857142857,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4444444444444444,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7142857142857143,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2857142857142857,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 18,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.6428571428571429,
                            "precision": 1.0,
                            "recall": 0.6428571428571429,
                            "f1": 0.782608695652174,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.35714285714285715,
                            "true_positive_rate": 0.6428571428571429
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6428571428571429,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6428571428571429,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.782608695652174,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.35714285714285715,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6428571428571429,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 21,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.75,
                            "precision": 1.0,
                            "recall": 0.75,
                            "f1": 0.8571428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25,
                            "true_positive_rate": 0.75
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 24,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8571428571428571,
                            "precision": 1.0,
                            "recall": 0.8571428571428571,
                            "f1": 0.9230769230769231,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14285714285714285,
                            "true_positive_rate": 0.8571428571428571
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9230769230769231,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.14285714285714285,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 26,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.9285714285714286,
                            "precision": 1.0,
                            "recall": 0.9285714285714286,
                            "f1": 0.9629629629629629,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.07142857142857142,
                            "true_positive_rate": 0.9285714285714286
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9285714285714286,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9285714285714286,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9629629629629629,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.07142857142857142,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9285714285714286,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 7,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 16,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.5714285714285714,
                            "precision": 1.0,
                            "recall": 0.5714285714285714,
                            "f1": 0.7272727272727273,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.42857142857142855,
                            "true_positive_rate": 0.5714285714285714
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5714285714285714,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5714285714285714,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7272727272727273,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.42857142857142855,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5714285714285714,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 21,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.75,
                            "precision": 1.0,
                            "recall": 0.75,
                            "f1": 0.8571428571428571,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25,
                            "true_positive_rate": 0.75
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 23,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8214285714285714,
                            "precision": 1.0,
                            "recall": 0.8214285714285714,
                            "f1": 0.9019607843137255,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17857142857142858,
                            "true_positive_rate": 0.8214285714285714
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8214285714285714,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8214285714285714,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9019607843137255,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.17857142857142858,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8214285714285714,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.6623376623376622,
                            "stdev": 0.24096071838025285
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6623376623376622,
                            "stdev": 0.24096071838025285
                        },
                        "f1": {
                            "average": 0.7687829320007883,
                            "stdev": 0.19538385951460757
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.3376623376623376,
                            "stdev": 0.24096071838025288
                        },
                        "true_positive_rate": {
                            "average": 0.6623376623376622,
                            "stdev": 0.24096071838025285
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 39,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.4936708860759494,
                            "precision": 1.0,
                            "recall": 0.4936708860759494,
                            "f1": 0.6610169491525424,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5063291139240507,
                            "true_positive_rate": 0.4936708860759494
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4936708860759494,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4936708860759494,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6610169491525424,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5063291139240507,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.4936708860759494,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 24,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.3037974683544304,
                            "precision": 1.0,
                            "recall": 0.3037974683544304,
                            "f1": 0.46601941747572817,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6962025316455697,
                            "true_positive_rate": 0.3037974683544304
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3037974683544304,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3037974683544304,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.46601941747572817,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6962025316455697,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3037974683544304,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 53,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.6708860759493671,
                            "precision": 1.0,
                            "recall": 0.6708860759493671,
                            "f1": 0.803030303030303,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3291139240506329,
                            "true_positive_rate": 0.6708860759493671
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6708860759493671,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6708860759493671,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.803030303030303,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3291139240506329,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6708860759493671,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 72,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9113924050632911,
                            "precision": 1.0,
                            "recall": 0.9113924050632911,
                            "f1": 0.9536423841059603,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.08860759493670886,
                            "true_positive_rate": 0.9113924050632911
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9113924050632911,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9113924050632911,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9536423841059603,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.08860759493670886,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9113924050632911,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 63,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.7974683544303798,
                            "precision": 1.0,
                            "recall": 0.7974683544303798,
                            "f1": 0.8873239436619719,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.20253164556962025,
                            "true_positive_rate": 0.7974683544303798
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7974683544303798,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7974683544303798,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8873239436619719,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.20253164556962025,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7974683544303798,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 68,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8607594936708861,
                            "precision": 1.0,
                            "recall": 0.8607594936708861,
                            "f1": 0.9251700680272109,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.13924050632911392,
                            "true_positive_rate": 0.8607594936708861
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8607594936708861,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8607594936708861,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9251700680272109,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.13924050632911392,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8607594936708861,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 35,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.4430379746835443,
                            "precision": 1.0,
                            "recall": 0.4430379746835443,
                            "f1": 0.6140350877192983,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5569620253164557,
                            "true_positive_rate": 0.4430379746835443
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4430379746835443,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4430379746835443,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6140350877192983,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5569620253164557,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.4430379746835443,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 69,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.8734177215189873,
                            "precision": 1.0,
                            "recall": 0.8734177215189873,
                            "f1": 0.9324324324324325,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.12658227848101267,
                            "true_positive_rate": 0.8734177215189873
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8734177215189873,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8734177215189873,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9324324324324325,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.12658227848101267,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8734177215189873,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 72,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9113924050632911,
                            "precision": 1.0,
                            "recall": 0.9113924050632911,
                            "f1": 0.9536423841059603,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.08860759493670886,
                            "true_positive_rate": 0.9113924050632911
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9113924050632911,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9113924050632911,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9536423841059603,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.08860759493670886,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9113924050632911,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 79,
                        "prediction_error_num": 76,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": 0.9620253164556962,
                            "precision": 1.0,
                            "recall": 0.9620253164556962,
                            "f1": 0.9806451612903225,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0379746835443038,
                            "true_positive_rate": 0.9620253164556962
                        }
                    },
                    "average": {
                        "total_num": 79,
                        "gold_error_num": 79,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9620253164556962,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9620253164556962,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9806451612903225,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0379746835443038,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9620253164556962,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.7479861910241656,
                            "stdev": 0.22452859666065497
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7479861910241656,
                            "stdev": 0.22452859666065497
                        },
                        "f1": {
                            "average": 0.8342689210001574,
                            "stdev": 0.16874119660867387
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.2520138089758343,
                            "stdev": 0.224528596660655
                        },
                        "true_positive_rate": {
                            "average": 0.7479861910241656,
                            "stdev": 0.22452859666065497
                        }
                    }
                }
            }
        },
        "Instruction-Following": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 29,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.4393939393939394,
                            "precision": 1.0,
                            "recall": 0.4393939393939394,
                            "f1": 0.6105263157894737,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5606060606060606,
                            "true_positive_rate": 0.4393939393939394
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4393939393939394,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4393939393939394,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6105263157894737,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5606060606060606,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.4393939393939394,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 16,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.24242424242424243,
                            "precision": 1.0,
                            "recall": 0.24242424242424243,
                            "f1": 0.3902439024390244,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7575757575757576,
                            "true_positive_rate": 0.24242424242424243
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.24242424242424243,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.24242424242424243,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.3902439024390244,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7575757575757576,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.24242424242424243,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 65,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.9848484848484849,
                            "precision": 1.0,
                            "recall": 0.9848484848484849,
                            "f1": 0.9923664122137404,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.015151515151515152,
                            "true_positive_rate": 0.9848484848484849
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9848484848484849,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9848484848484849,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9923664122137404,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.015151515151515152,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9848484848484849,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 36,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.5454545454545454,
                            "precision": 1.0,
                            "recall": 0.5454545454545454,
                            "f1": 0.7058823529411765,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.45454545454545453,
                            "true_positive_rate": 0.5454545454545454
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5454545454545454,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5454545454545454,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7058823529411765,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.45454545454545453,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5454545454545454,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 46,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.696969696969697,
                            "precision": 1.0,
                            "recall": 0.696969696969697,
                            "f1": 0.8214285714285714,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.30303030303030304,
                            "true_positive_rate": 0.696969696969697
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.696969696969697,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.696969696969697,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8214285714285714,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.30303030303030304,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.696969696969697,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 52,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.7878787878787878,
                            "precision": 1.0,
                            "recall": 0.7878787878787878,
                            "f1": 0.8813559322033898,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.21212121212121213,
                            "true_positive_rate": 0.7878787878787878
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7878787878787878,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7878787878787878,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8813559322033898,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.21212121212121213,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7878787878787878,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 58,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.8787878787878788,
                            "precision": 1.0,
                            "recall": 0.8787878787878788,
                            "f1": 0.9354838709677419,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.12121212121212122,
                            "true_positive_rate": 0.8787878787878788
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8787878787878788,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8787878787878788,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9354838709677419,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.12121212121212122,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8787878787878788,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 21,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.3181818181818182,
                            "precision": 1.0,
                            "recall": 0.3181818181818182,
                            "f1": 0.4827586206896552,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6818181818181818,
                            "true_positive_rate": 0.3181818181818182
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3181818181818182,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3181818181818182,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4827586206896552,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6818181818181818,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3181818181818182,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 40,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.6060606060606061,
                            "precision": 1.0,
                            "recall": 0.6060606060606061,
                            "f1": 0.7547169811320755,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3939393939393939,
                            "true_positive_rate": 0.6060606060606061
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6060606060606061,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6060606060606061,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7547169811320755,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3939393939393939,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6060606060606061,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 44,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.6666666666666666,
                            "precision": 1.0,
                            "recall": 0.6666666666666666,
                            "f1": 0.8,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3333333333333333,
                            "true_positive_rate": 0.6666666666666666
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6666666666666666,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6666666666666666,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3333333333333333,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6666666666666666,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 66,
                        "prediction_error_num": 38,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": 0.5757575757575758,
                            "precision": 1.0,
                            "recall": 0.5757575757575758,
                            "f1": 0.7307692307692307,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.42424242424242425,
                            "true_positive_rate": 0.5757575757575758
                        }
                    },
                    "average": {
                        "total_num": 66,
                        "gold_error_num": 66,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5757575757575758,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5757575757575758,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7307692307692307,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.42424242424242425,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5757575757575758,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.6129476584022039,
                            "stdev": 0.21528180941667255
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6129476584022039,
                            "stdev": 0.21528180941667255
                        },
                        "f1": {
                            "average": 0.7368665627794617,
                            "stdev": 0.17548337380656284
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.38705234159779617,
                            "stdev": 0.21528180941667255
                        },
                        "true_positive_rate": {
                            "average": 0.6129476584022039,
                            "stdev": 0.21528180941667255
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 49,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.48514851485148514,
                            "precision": 1.0,
                            "recall": 0.48514851485148514,
                            "f1": 0.6533333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5148514851485149,
                            "true_positive_rate": 0.48514851485148514
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.48514851485148514,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.48514851485148514,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6533333333333333,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5148514851485149,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.48514851485148514,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 33,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.32673267326732675,
                            "precision": 1.0,
                            "recall": 0.32673267326732675,
                            "f1": 0.4925373134328358,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6732673267326733,
                            "true_positive_rate": 0.32673267326732675
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.32673267326732675,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.32673267326732675,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4925373134328358,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6732673267326733,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.32673267326732675,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 100,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.9900990099009901,
                            "precision": 1.0,
                            "recall": 0.9900990099009901,
                            "f1": 0.9950248756218906,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.009900990099009901,
                            "true_positive_rate": 0.9900990099009901
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9900990099009901,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9900990099009901,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9950248756218906,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.009900990099009901,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9900990099009901,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 70,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.693069306930693,
                            "precision": 1.0,
                            "recall": 0.693069306930693,
                            "f1": 0.8187134502923976,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3069306930693069,
                            "true_positive_rate": 0.693069306930693
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.693069306930693,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.693069306930693,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8187134502923976,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3069306930693069,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.693069306930693,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 84,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8316831683168316,
                            "precision": 1.0,
                            "recall": 0.8316831683168316,
                            "f1": 0.9081081081081082,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.16831683168316833,
                            "true_positive_rate": 0.8316831683168316
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8316831683168316,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8316831683168316,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9081081081081082,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.16831683168316833,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8316831683168316,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 74,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.7326732673267327,
                            "precision": 1.0,
                            "recall": 0.7326732673267327,
                            "f1": 0.8457142857142858,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.26732673267326734,
                            "true_positive_rate": 0.7326732673267327
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7326732673267327,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7326732673267327,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8457142857142858,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.26732673267326734,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7326732673267327,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 81,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.801980198019802,
                            "precision": 1.0,
                            "recall": 0.801980198019802,
                            "f1": 0.8901098901098901,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.19801980198019803,
                            "true_positive_rate": 0.801980198019802
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.801980198019802,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.801980198019802,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8901098901098901,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.19801980198019803,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.801980198019802,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 47,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.46534653465346537,
                            "precision": 1.0,
                            "recall": 0.46534653465346537,
                            "f1": 0.6351351351351351,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5346534653465347,
                            "true_positive_rate": 0.46534653465346537
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.46534653465346537,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.46534653465346537,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6351351351351351,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5346534653465347,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.46534653465346537,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 88,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8712871287128713,
                            "precision": 1.0,
                            "recall": 0.8712871287128713,
                            "f1": 0.9312169312169312,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.12871287128712872,
                            "true_positive_rate": 0.8712871287128713
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8712871287128713,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8712871287128713,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9312169312169312,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.12871287128712872,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8712871287128713,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 85,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8415841584158416,
                            "precision": 1.0,
                            "recall": 0.8415841584158416,
                            "f1": 0.9139784946236559,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15841584158415842,
                            "true_positive_rate": 0.8415841584158416
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8415841584158416,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8415841584158416,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9139784946236559,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.15841584158415842,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8415841584158416,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 101,
                        "prediction_error_num": 90,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": 0.8910891089108911,
                            "precision": 1.0,
                            "recall": 0.8910891089108911,
                            "f1": 0.9424083769633508,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.10891089108910891,
                            "true_positive_rate": 0.8910891089108911
                        }
                    },
                    "average": {
                        "total_num": 101,
                        "gold_error_num": 101,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8910891089108911,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8910891089108911,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9424083769633508,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.10891089108910891,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8910891089108911,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.7209720972097209,
                            "stdev": 0.19877525481122904
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.7209720972097209,
                            "stdev": 0.19877525481122904
                        },
                        "f1": {
                            "average": 0.8205709267774377,
                            "stdev": 0.15055685056896054
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.2790279027902791,
                            "stdev": 0.19877525481122907
                        },
                        "true_positive_rate": {
                            "average": 0.7209720972097209,
                            "stdev": 0.19877525481122904
                        }
                    }
                }
            }
        }
    },
    "finegrained_fact_verification": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 33,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.9705882352941176,
                            "precision": 1.0,
                            "recall": 0.9705882352941176,
                            "f1": 0.9850746268656716,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.029411764705882353,
                            "true_positive_rate": 0.9705882352941176
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9705882352941176,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9705882352941176,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9850746268656716,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.029411764705882353,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9705882352941176,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 32,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.9411764705882353,
                            "precision": 1.0,
                            "recall": 0.9411764705882353,
                            "f1": 0.9696969696969697,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.058823529411764705,
                            "true_positive_rate": 0.9411764705882353
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9411764705882353,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9411764705882353,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9696969696969697,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.058823529411764705,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9411764705882353,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 26,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.7647058823529411,
                            "precision": 1.0,
                            "recall": 0.7647058823529411,
                            "f1": 0.8666666666666667,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.23529411764705882,
                            "true_positive_rate": 0.7647058823529411
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7647058823529411,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7647058823529411,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8666666666666667,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.23529411764705882,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7647058823529411,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 25,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.7352941176470589,
                            "precision": 1.0,
                            "recall": 0.7352941176470589,
                            "f1": 0.847457627118644,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2647058823529412,
                            "true_positive_rate": 0.7352941176470589
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7352941176470589,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7352941176470589,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.847457627118644,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2647058823529412,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7352941176470589,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 24,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.7058823529411765,
                            "precision": 1.0,
                            "recall": 0.7058823529411765,
                            "f1": 0.8275862068965517,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.29411764705882354,
                            "true_positive_rate": 0.7058823529411765
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7058823529411765,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7058823529411765,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8275862068965517,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.29411764705882354,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7058823529411765,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 16,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.47058823529411764,
                            "precision": 1.0,
                            "recall": 0.47058823529411764,
                            "f1": 0.64,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5294117647058824,
                            "true_positive_rate": 0.47058823529411764
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.47058823529411764,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.47058823529411764,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.64,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5294117647058824,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.47058823529411764,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 12,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.35294117647058826,
                            "precision": 1.0,
                            "recall": 0.35294117647058826,
                            "f1": 0.5217391304347826,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6470588235294118,
                            "true_positive_rate": 0.35294117647058826
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.35294117647058826,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.35294117647058826,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5217391304347826,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6470588235294118,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.35294117647058826,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 3,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.08823529411764706,
                            "precision": 1.0,
                            "recall": 0.08823529411764706,
                            "f1": 0.16216216216216217,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9117647058823529,
                            "true_positive_rate": 0.08823529411764706
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.08823529411764706,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.08823529411764706,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.16216216216216217,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9117647058823529,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.08823529411764706,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 34,
                        "prediction_error_num": 9,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": 0.2647058823529412,
                            "precision": 1.0,
                            "recall": 0.2647058823529412,
                            "f1": 0.4186046511627907,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7352941176470589,
                            "true_positive_rate": 0.2647058823529412
                        }
                    },
                    "average": {
                        "total_num": 34,
                        "gold_error_num": 34,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2647058823529412,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2647058823529412,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4186046511627907,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7352941176470589,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2647058823529412,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.6631016042780747,
                            "stdev": 0.3072889390066019
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6631016042780747,
                            "stdev": 0.3072889390066019
                        },
                        "f1": {
                            "average": 0.7489989128185672,
                            "stdev": 0.2658132905654993
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.3368983957219251,
                            "stdev": 0.307288939006602
                        },
                        "true_positive_rate": {
                            "average": 0.6631016042780747,
                            "stdev": 0.3072889390066019
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 75,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.8426966292134831,
                            "precision": 1.0,
                            "recall": 0.8426966292134831,
                            "f1": 0.9146341463414634,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15730337078651685,
                            "true_positive_rate": 0.8426966292134831
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8426966292134831,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8426966292134831,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9146341463414634,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.15730337078651685,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8426966292134831,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 75,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.8426966292134831,
                            "precision": 1.0,
                            "recall": 0.8426966292134831,
                            "f1": 0.9146341463414634,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15730337078651685,
                            "true_positive_rate": 0.8426966292134831
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8426966292134831,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8426966292134831,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9146341463414634,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.15730337078651685,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8426966292134831,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 57,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.6404494382022472,
                            "precision": 1.0,
                            "recall": 0.6404494382022472,
                            "f1": 0.7808219178082192,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3595505617977528,
                            "true_positive_rate": 0.6404494382022472
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6404494382022472,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6404494382022472,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7808219178082192,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3595505617977528,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6404494382022472,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 19,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.21348314606741572,
                            "precision": 1.0,
                            "recall": 0.21348314606741572,
                            "f1": 0.35185185185185186,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7865168539325843,
                            "true_positive_rate": 0.21348314606741572
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.21348314606741572,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.21348314606741572,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.35185185185185186,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7865168539325843,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.21348314606741572,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 32,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.3595505617977528,
                            "precision": 1.0,
                            "recall": 0.3595505617977528,
                            "f1": 0.5289256198347108,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6404494382022472,
                            "true_positive_rate": 0.3595505617977528
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3595505617977528,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3595505617977528,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5289256198347108,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6404494382022472,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3595505617977528,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 20,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.2247191011235955,
                            "precision": 1.0,
                            "recall": 0.2247191011235955,
                            "f1": 0.3669724770642202,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7752808988764045,
                            "true_positive_rate": 0.2247191011235955
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2247191011235955,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2247191011235955,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.3669724770642202,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7752808988764045,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2247191011235955,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 33,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.3707865168539326,
                            "precision": 1.0,
                            "recall": 0.3707865168539326,
                            "f1": 0.5409836065573771,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6292134831460674,
                            "true_positive_rate": 0.3707865168539326
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3707865168539326,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3707865168539326,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5409836065573771,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6292134831460674,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3707865168539326,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 50,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.5617977528089888,
                            "precision": 1.0,
                            "recall": 0.5617977528089888,
                            "f1": 0.7194244604316546,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.43820224719101125,
                            "true_positive_rate": 0.5617977528089888
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5617977528089888,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5617977528089888,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7194244604316546,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.43820224719101125,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5617977528089888,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 20,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.2247191011235955,
                            "precision": 1.0,
                            "recall": 0.2247191011235955,
                            "f1": 0.3669724770642202,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7752808988764045,
                            "true_positive_rate": 0.2247191011235955
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2247191011235955,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2247191011235955,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.3669724770642202,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7752808988764045,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2247191011235955,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 89,
                        "prediction_error_num": 47,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": 0.5280898876404494,
                            "precision": 1.0,
                            "recall": 0.5280898876404494,
                            "f1": 0.6911764705882353,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.47191011235955055,
                            "true_positive_rate": 0.5280898876404494
                        }
                    },
                    "average": {
                        "total_num": 89,
                        "gold_error_num": 89,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5280898876404494,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5280898876404494,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6911764705882353,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.47191011235955055,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5280898876404494,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.5280898876404495,
                            "stdev": 0.2646798703568826
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5280898876404495,
                            "stdev": 0.2646798703568826
                        },
                        "f1": {
                            "average": 0.6523997430803106,
                            "stdev": 0.22598749025994516
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.47191011235955055,
                            "stdev": 0.2646798703568827
                        },
                        "true_positive_rate": {
                            "average": 0.5280898876404495,
                            "stdev": 0.2646798703568826
                        }
                    }
                }
            }
        },
        "Instruction-Following": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 7,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.875,
                            "precision": 1.0,
                            "recall": 0.875,
                            "f1": 0.9333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.125,
                            "true_positive_rate": 0.875
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.875,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.875,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9333333333333333,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.875,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 7,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.875,
                            "precision": 1.0,
                            "recall": 0.875,
                            "f1": 0.9333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.125,
                            "true_positive_rate": 0.875
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.875,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.875,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9333333333333333,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.875,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 7,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.875,
                            "precision": 1.0,
                            "recall": 0.875,
                            "f1": 0.9333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.125,
                            "true_positive_rate": 0.875
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.875,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.875,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9333333333333333,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.875,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 5,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.625,
                            "precision": 1.0,
                            "recall": 0.625,
                            "f1": 0.7692307692307693,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.375,
                            "true_positive_rate": 0.625
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.625,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.625,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7692307692307693,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.375,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.625,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 1,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.125,
                            "precision": 1.0,
                            "recall": 0.125,
                            "f1": 0.2222222222222222,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.875,
                            "true_positive_rate": 0.125
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.2222222222222222,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.875,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.125,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 8,
                        "prediction_error_num": 2,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 8,
                        "gold_error_num": 8,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.5113636363636364,
                            "stdev": 0.3218136876933249
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5113636363636364,
                            "stdev": 0.3218136876933249
                        },
                        "f1": {
                            "average": 0.6174048174048175,
                            "stdev": 0.27983050577945506
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.48863636363636365,
                            "stdev": 0.3218136876933249
                        },
                        "true_positive_rate": {
                            "average": 0.5113636363636364,
                            "stdev": 0.3218136876933249
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 60,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.8450704225352113,
                            "precision": 1.0,
                            "recall": 0.8450704225352113,
                            "f1": 0.916030534351145,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15492957746478872,
                            "true_positive_rate": 0.8450704225352113
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8450704225352113,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8450704225352113,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.916030534351145,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.15492957746478872,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8450704225352113,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 60,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.8450704225352113,
                            "precision": 1.0,
                            "recall": 0.8450704225352113,
                            "f1": 0.916030534351145,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.15492957746478872,
                            "true_positive_rate": 0.8450704225352113
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8450704225352113,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8450704225352113,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.916030534351145,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.15492957746478872,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8450704225352113,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 70,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.9859154929577465,
                            "precision": 1.0,
                            "recall": 0.9859154929577465,
                            "f1": 0.9929078014184397,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.014084507042253521,
                            "true_positive_rate": 0.9859154929577465
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9859154929577465,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9859154929577465,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9929078014184397,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.014084507042253521,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9859154929577465,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 45,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.6338028169014085,
                            "precision": 1.0,
                            "recall": 0.6338028169014085,
                            "f1": 0.7758620689655172,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.36619718309859156,
                            "true_positive_rate": 0.6338028169014085
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6338028169014085,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6338028169014085,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7758620689655172,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.36619718309859156,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6338028169014085,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 11,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.15492957746478872,
                            "precision": 1.0,
                            "recall": 0.15492957746478872,
                            "f1": 0.2682926829268293,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8450704225352113,
                            "true_positive_rate": 0.15492957746478872
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.15492957746478872,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.15492957746478872,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.2682926829268293,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8450704225352113,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.15492957746478872,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 28,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.39436619718309857,
                            "precision": 1.0,
                            "recall": 0.39436619718309857,
                            "f1": 0.5656565656565656,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6056338028169014,
                            "true_positive_rate": 0.39436619718309857
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.39436619718309857,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.39436619718309857,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5656565656565656,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6056338028169014,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.39436619718309857,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 21,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.29577464788732394,
                            "precision": 1.0,
                            "recall": 0.29577464788732394,
                            "f1": 0.45652173913043476,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.704225352112676,
                            "true_positive_rate": 0.29577464788732394
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.29577464788732394,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.29577464788732394,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.45652173913043476,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.704225352112676,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.29577464788732394,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 23,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.323943661971831,
                            "precision": 1.0,
                            "recall": 0.323943661971831,
                            "f1": 0.48936170212765956,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.676056338028169,
                            "true_positive_rate": 0.323943661971831
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.323943661971831,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.323943661971831,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.48936170212765956,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.676056338028169,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.323943661971831,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 39,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.5492957746478874,
                            "precision": 1.0,
                            "recall": 0.5492957746478874,
                            "f1": 0.7090909090909091,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4507042253521127,
                            "true_positive_rate": 0.5492957746478874
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5492957746478874,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5492957746478874,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7090909090909091,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4507042253521127,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5492957746478874,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 22,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.30985915492957744,
                            "precision": 1.0,
                            "recall": 0.30985915492957744,
                            "f1": 0.4731182795698925,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6901408450704225,
                            "true_positive_rate": 0.30985915492957744
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.30985915492957744,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.30985915492957744,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4731182795698925,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6901408450704225,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.30985915492957744,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 71,
                        "prediction_error_num": 38,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": 0.5352112676056338,
                            "precision": 1.0,
                            "recall": 0.5352112676056338,
                            "f1": 0.6972477064220184,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4647887323943662,
                            "true_positive_rate": 0.5352112676056338
                        }
                    },
                    "average": {
                        "total_num": 71,
                        "gold_error_num": 71,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5352112676056338,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5352112676056338,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6972477064220184,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4647887323943662,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5352112676056338,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.5339308578745198,
                            "stdev": 0.25624194745441103
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5339308578745198,
                            "stdev": 0.25624194745441103
                        },
                        "f1": {
                            "average": 0.6600109567282324,
                            "stdev": 0.21886756365220747
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.4660691421254801,
                            "stdev": 0.256241947454411
                        },
                        "true_positive_rate": {
                            "average": 0.5339308578745198,
                            "stdev": 0.25624194745441103
                        }
                    }
                }
            }
        },
        "Context-Faithfulness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 48,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.9230769230769231,
                            "precision": 1.0,
                            "recall": 0.9230769230769231,
                            "f1": 0.96,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.07692307692307693,
                            "true_positive_rate": 0.9230769230769231
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9230769230769231,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9230769230769231,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.96,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.07692307692307693,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9230769230769231,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 40,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.7692307692307693,
                            "precision": 1.0,
                            "recall": 0.7692307692307693,
                            "f1": 0.8695652173913043,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.23076923076923078,
                            "true_positive_rate": 0.7692307692307693
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7692307692307693,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7692307692307693,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8695652173913043,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.23076923076923078,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7692307692307693,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 49,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.9423076923076923,
                            "precision": 1.0,
                            "recall": 0.9423076923076923,
                            "f1": 0.9702970297029703,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.057692307692307696,
                            "true_positive_rate": 0.9423076923076923
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9423076923076923,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9423076923076923,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9702970297029703,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.057692307692307696,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9423076923076923,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 28,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.5384615384615384,
                            "precision": 1.0,
                            "recall": 0.5384615384615384,
                            "f1": 0.7,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.46153846153846156,
                            "true_positive_rate": 0.5384615384615384
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.46153846153846156,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 22,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.4230769230769231,
                            "precision": 1.0,
                            "recall": 0.4230769230769231,
                            "f1": 0.5945945945945946,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5769230769230769,
                            "true_positive_rate": 0.4230769230769231
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4230769230769231,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4230769230769231,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5945945945945946,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5769230769230769,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.4230769230769231,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 21,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.40384615384615385,
                            "precision": 1.0,
                            "recall": 0.40384615384615385,
                            "f1": 0.5753424657534246,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5961538461538461,
                            "true_positive_rate": 0.40384615384615385
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.40384615384615385,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.40384615384615385,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5753424657534246,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5961538461538461,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.40384615384615385,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 17,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.3269230769230769,
                            "precision": 1.0,
                            "recall": 0.3269230769230769,
                            "f1": 0.4927536231884058,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6730769230769231,
                            "true_positive_rate": 0.3269230769230769
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3269230769230769,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3269230769230769,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4927536231884058,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6730769230769231,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3269230769230769,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 7,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.1346153846153846,
                            "precision": 1.0,
                            "recall": 0.1346153846153846,
                            "f1": 0.23728813559322035,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8653846153846154,
                            "true_positive_rate": 0.1346153846153846
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1346153846153846,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1346153846153846,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.23728813559322035,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8653846153846154,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.1346153846153846,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 4,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.07692307692307693,
                            "precision": 1.0,
                            "recall": 0.07692307692307693,
                            "f1": 0.14285714285714285,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9230769230769231,
                            "true_positive_rate": 0.07692307692307693
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.07692307692307693,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.07692307692307693,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.14285714285714285,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9230769230769231,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.07692307692307693,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 52,
                        "prediction_error_num": 9,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": 0.17307692307692307,
                            "precision": 1.0,
                            "recall": 0.17307692307692307,
                            "f1": 0.29508196721311475,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8269230769230769,
                            "true_positive_rate": 0.17307692307692307
                        }
                    },
                    "average": {
                        "total_num": 52,
                        "gold_error_num": 52,
                        "metrics": {
                            "accuracy": {
                                "average": 0.17307692307692307,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.17307692307692307,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.29508196721311475,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8269230769230769,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.17307692307692307,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.5192307692307693,
                            "stdev": 0.32460096743247047
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5192307692307693,
                            "stdev": 0.32460096743247047
                        },
                        "f1": {
                            "average": 0.6216163796631071,
                            "stdev": 0.2938612602365058
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.4807692307692308,
                            "stdev": 0.32460096743247047
                        },
                        "true_positive_rate": {
                            "average": 0.5192307692307693,
                            "stdev": 0.32460096743247047
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 59,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.8082191780821918,
                            "precision": 1.0,
                            "recall": 0.8082191780821918,
                            "f1": 0.8939393939393939,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.1917808219178082,
                            "true_positive_rate": 0.8082191780821918
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8082191780821918,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8082191780821918,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8939393939393939,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.1917808219178082,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8082191780821918,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 63,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.863013698630137,
                            "precision": 1.0,
                            "recall": 0.863013698630137,
                            "f1": 0.9264705882352942,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.136986301369863,
                            "true_positive_rate": 0.863013698630137
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.863013698630137,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.863013698630137,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9264705882352942,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.136986301369863,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.863013698630137,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 72,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.9863013698630136,
                            "precision": 1.0,
                            "recall": 0.9863013698630136,
                            "f1": 0.993103448275862,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0136986301369863,
                            "true_positive_rate": 0.9863013698630136
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9863013698630136,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9863013698630136,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.993103448275862,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0136986301369863,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9863013698630136,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 43,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.589041095890411,
                            "precision": 1.0,
                            "recall": 0.589041095890411,
                            "f1": 0.7413793103448276,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.410958904109589,
                            "true_positive_rate": 0.589041095890411
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.589041095890411,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.589041095890411,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7413793103448276,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.410958904109589,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.589041095890411,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 21,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.2876712328767123,
                            "precision": 1.0,
                            "recall": 0.2876712328767123,
                            "f1": 0.44680851063829785,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7123287671232876,
                            "true_positive_rate": 0.2876712328767123
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2876712328767123,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2876712328767123,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.44680851063829785,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7123287671232876,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2876712328767123,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 27,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.3698630136986301,
                            "precision": 1.0,
                            "recall": 0.3698630136986301,
                            "f1": 0.54,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6301369863013698,
                            "true_positive_rate": 0.3698630136986301
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3698630136986301,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3698630136986301,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.54,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6301369863013698,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3698630136986301,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 18,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.2465753424657534,
                            "precision": 1.0,
                            "recall": 0.2465753424657534,
                            "f1": 0.3956043956043956,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7534246575342466,
                            "true_positive_rate": 0.2465753424657534
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2465753424657534,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2465753424657534,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.3956043956043956,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7534246575342466,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2465753424657534,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 24,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.3287671232876712,
                            "precision": 1.0,
                            "recall": 0.3287671232876712,
                            "f1": 0.4948453608247423,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6712328767123288,
                            "true_positive_rate": 0.3287671232876712
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3287671232876712,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3287671232876712,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4948453608247423,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6712328767123288,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3287671232876712,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 35,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.4794520547945205,
                            "precision": 1.0,
                            "recall": 0.4794520547945205,
                            "f1": 0.6481481481481481,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.5205479452054794,
                            "true_positive_rate": 0.4794520547945205
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.4794520547945205,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.4794520547945205,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.6481481481481481,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.5205479452054794,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.4794520547945205,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 23,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.3150684931506849,
                            "precision": 1.0,
                            "recall": 0.3150684931506849,
                            "f1": 0.4791666666666667,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.684931506849315,
                            "true_positive_rate": 0.3150684931506849
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3150684931506849,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3150684931506849,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4791666666666667,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.684931506849315,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3150684931506849,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 73,
                        "prediction_error_num": 46,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": 0.6301369863013698,
                            "precision": 1.0,
                            "recall": 0.6301369863013698,
                            "f1": 0.773109243697479,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3698630136986301,
                            "true_positive_rate": 0.6301369863013698
                        }
                    },
                    "average": {
                        "total_num": 73,
                        "gold_error_num": 73,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6301369863013698,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6301369863013698,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.773109243697479,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3698630136986301,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6301369863013698,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.5367372353673724,
                            "stdev": 0.24559221343122728
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.5367372353673724,
                            "stdev": 0.24559221343122728
                        },
                        "f1": {
                            "average": 0.6665977333068279,
                            "stdev": 0.201246670520547
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.46326276463262767,
                            "stdev": 0.24559221343122728
                        },
                        "true_positive_rate": {
                            "average": 0.5367372353673724,
                            "stdev": 0.24559221343122728
                        }
                    }
                }
            }
        }
    },
    "answerability_classification": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 24,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8571428571428571,
                            "precision": 1.0,
                            "recall": 0.8571428571428571,
                            "f1": 0.9230769230769231,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14285714285714285,
                            "true_positive_rate": 0.8571428571428571
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9230769230769231,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.14285714285714285,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 23,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.8214285714285714,
                            "precision": 1.0,
                            "recall": 0.8214285714285714,
                            "f1": 0.9019607843137255,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.17857142857142858,
                            "true_positive_rate": 0.8214285714285714
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8214285714285714,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8214285714285714,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9019607843137255,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.17857142857142858,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8214285714285714,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 10,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.35714285714285715,
                            "precision": 1.0,
                            "recall": 0.35714285714285715,
                            "f1": 0.5263157894736842,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6428571428571429,
                            "true_positive_rate": 0.35714285714285715
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.35714285714285715,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.35714285714285715,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5263157894736842,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6428571428571429,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.35714285714285715,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 20,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.7142857142857143,
                            "precision": 1.0,
                            "recall": 0.7142857142857143,
                            "f1": 0.8333333333333334,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2857142857142857,
                            "true_positive_rate": 0.7142857142857143
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7142857142857143,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7142857142857143,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8333333333333334,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2857142857142857,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7142857142857143,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 9,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.32142857142857145,
                            "precision": 1.0,
                            "recall": 0.32142857142857145,
                            "f1": 0.4864864864864865,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6785714285714286,
                            "true_positive_rate": 0.32142857142857145
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.32142857142857145,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.32142857142857145,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4864864864864865,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6785714285714286,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.32142857142857145,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 7,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 6,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.21428571428571427,
                            "precision": 1.0,
                            "recall": 0.21428571428571427,
                            "f1": 0.35294117647058826,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7857142857142857,
                            "true_positive_rate": 0.21428571428571427
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.21428571428571427,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.21428571428571427,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.35294117647058826,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7857142857142857,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.21428571428571427,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 7,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.25,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.75,
                            "true_positive_rate": 0.25
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.25,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 3,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.10714285714285714,
                            "precision": 1.0,
                            "recall": 0.10714285714285714,
                            "f1": 0.1935483870967742,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8928571428571429,
                            "true_positive_rate": 0.10714285714285714
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.10714285714285714,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.10714285714285714,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.1935483870967742,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8928571428571429,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.10714285714285714,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 28,
                        "prediction_error_num": 9,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": 0.32142857142857145,
                            "precision": 1.0,
                            "recall": 0.32142857142857145,
                            "f1": 0.4864864864864865,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6785714285714286,
                            "true_positive_rate": 0.32142857142857145
                        }
                    },
                    "average": {
                        "total_num": 28,
                        "gold_error_num": 28,
                        "metrics": {
                            "accuracy": {
                                "average": 0.32142857142857145,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.32142857142857145,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4864864864864865,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6785714285714286,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.32142857142857145,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.47402597402597396,
                            "stdev": 0.296113864038191
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.47402597402597396,
                            "stdev": 0.296113864038191
                        },
                        "f1": {
                            "average": 0.5912863060670911,
                            "stdev": 0.2605255560920444
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.5259740259740261,
                            "stdev": 0.296113864038191
                        },
                        "true_positive_rate": {
                            "average": 0.47402597402597396,
                            "stdev": 0.296113864038191
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 19,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.31666666666666665,
                            "precision": 1.0,
                            "recall": 0.31666666666666665,
                            "f1": 0.4810126582278481,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6833333333333333,
                            "true_positive_rate": 0.31666666666666665
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.31666666666666665,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.31666666666666665,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4810126582278481,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6833333333333333,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.31666666666666665,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 42,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.7,
                            "precision": 1.0,
                            "recall": 0.7,
                            "f1": 0.8235294117647058,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.3,
                            "true_positive_rate": 0.7
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8235294117647058,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 58,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.9666666666666667,
                            "precision": 1.0,
                            "recall": 0.9666666666666667,
                            "f1": 0.9830508474576272,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.03333333333333333,
                            "true_positive_rate": 0.9666666666666667
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9666666666666667,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9666666666666667,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9830508474576272,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.03333333333333333,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9666666666666667,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 4,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.06666666666666667,
                            "precision": 1.0,
                            "recall": 0.06666666666666667,
                            "f1": 0.125,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9333333333333333,
                            "true_positive_rate": 0.06666666666666667
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.06666666666666667,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.06666666666666667,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9333333333333333,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.06666666666666667,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 17,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.2833333333333333,
                            "precision": 1.0,
                            "recall": 0.2833333333333333,
                            "f1": 0.44155844155844154,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7166666666666667,
                            "true_positive_rate": 0.2833333333333333
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2833333333333333,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2833333333333333,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.44155844155844154,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7166666666666667,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2833333333333333,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 10,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.16666666666666666,
                            "precision": 1.0,
                            "recall": 0.16666666666666666,
                            "f1": 0.2857142857142857,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8333333333333334,
                            "true_positive_rate": 0.16666666666666666
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.16666666666666666,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.16666666666666666,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.2857142857142857,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8333333333333334,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.16666666666666666,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 1,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.016666666666666666,
                            "precision": 1.0,
                            "recall": 0.016666666666666666,
                            "f1": 0.03278688524590164,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9833333333333333,
                            "true_positive_rate": 0.016666666666666666
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.016666666666666666,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.016666666666666666,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.03278688524590164,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9833333333333333,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.016666666666666666,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 6,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.1,
                            "precision": 1.0,
                            "recall": 0.1,
                            "f1": 0.18181818181818182,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9,
                            "true_positive_rate": 0.1
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.18181818181818182,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.1,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 11,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.18333333333333332,
                            "precision": 1.0,
                            "recall": 0.18333333333333332,
                            "f1": 0.30985915492957744,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8166666666666667,
                            "true_positive_rate": 0.18333333333333332
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.18333333333333332,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.18333333333333332,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.30985915492957744,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8166666666666667,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.18333333333333332,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 35,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.5833333333333334,
                            "precision": 1.0,
                            "recall": 0.5833333333333334,
                            "f1": 0.7368421052631579,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4166666666666667,
                            "true_positive_rate": 0.5833333333333334
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5833333333333334,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5833333333333334,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7368421052631579,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4166666666666667,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5833333333333334,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 60,
                        "prediction_error_num": 48,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8,
                            "precision": 1.0,
                            "recall": 0.8,
                            "f1": 0.8888888888888888,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2,
                            "true_positive_rate": 0.8
                        }
                    },
                    "average": {
                        "total_num": 60,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8888888888888888,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.38030303030303025,
                            "stdev": 0.31195199885042707
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.38030303030303025,
                            "stdev": 0.31195199885042707
                        },
                        "f1": {
                            "average": 0.4809146237153287,
                            "stdev": 0.3144015949527643
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6196969696969697,
                            "stdev": 0.31195199885042707
                        },
                        "true_positive_rate": {
                            "average": 0.38030303030303025,
                            "stdev": 0.31195199885042707
                        }
                    }
                }
            }
        },
        "Parameterized Knowledge": {
            "initial_model=gpt-4-0613": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 48,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.7704918032786885,
                            "precision": 0.9791666666666666,
                            "recall": 0.7833333333333333,
                            "f1": 0.8703703703703703,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.21311475409836064,
                            "true_positive_rate": 0.7704918032786885
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7704918032786885,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.9791666666666666,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7833333333333333,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8703703703703703,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.21311475409836064,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7704918032786885,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 55,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.8852459016393442,
                            "precision": 0.9818181818181818,
                            "recall": 0.9,
                            "f1": 0.9391304347826087,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.09836065573770492,
                            "true_positive_rate": 0.8852459016393442
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8852459016393442,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.9818181818181818,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9391304347826087,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.09836065573770492,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8852459016393442,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.9836065573770492,
                            "precision": 0.9836065573770492,
                            "recall": 1.0,
                            "f1": 0.9917355371900827,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 0.9836065573770492
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9836065573770492,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.9836065573770492,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9917355371900827,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9836065573770492,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 21,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.36065573770491804,
                            "precision": 1.0,
                            "recall": 0.35,
                            "f1": 0.5185185185185185,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.639344262295082,
                            "true_positive_rate": 0.3442622950819672
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.36065573770491804,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.35,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5185185185185185,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.639344262295082,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3442622950819672,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 40,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.639344262295082,
                            "precision": 0.975,
                            "recall": 0.65,
                            "f1": 0.78,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.01639344262295082,
                            "false_negative_rate": 0.3442622950819672,
                            "true_positive_rate": 0.639344262295082
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.639344262295082,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.975,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.65,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.78,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.3442622950819672,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.639344262295082,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 4,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.08196721311475409,
                            "precision": 1.0,
                            "recall": 0.06666666666666667,
                            "f1": 0.125,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9180327868852459,
                            "true_positive_rate": 0.06557377049180328
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.08196721311475409,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.06666666666666667,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9180327868852459,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.06557377049180328,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 11,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.19672131147540983,
                            "precision": 1.0,
                            "recall": 0.18333333333333332,
                            "f1": 0.30985915492957744,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8032786885245902,
                            "true_positive_rate": 0.18032786885245902
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.19672131147540983,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.18333333333333332,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.30985915492957744,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8032786885245902,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.18032786885245902,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 15,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.26229508196721313,
                            "precision": 1.0,
                            "recall": 0.25,
                            "f1": 0.4,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7377049180327869,
                            "true_positive_rate": 0.2459016393442623
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.26229508196721313,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7377049180327869,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2459016393442623,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 17,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.29508196721311475,
                            "precision": 1.0,
                            "recall": 0.2833333333333333,
                            "f1": 0.44155844155844154,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7049180327868853,
                            "true_positive_rate": 0.2786885245901639
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.29508196721311475,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2833333333333333,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.44155844155844154,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7049180327868853,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2786885245901639,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 6,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.11475409836065574,
                            "precision": 1.0,
                            "recall": 0.1,
                            "f1": 0.18181818181818182,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8852459016393442,
                            "true_positive_rate": 0.09836065573770492
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.11475409836065574,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.18181818181818182,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8852459016393442,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.09836065573770492,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 61,
                        "prediction_error_num": 8,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": 0.14754098360655737,
                            "precision": 1.0,
                            "recall": 0.13333333333333333,
                            "f1": 0.23529411764705882,
                            "true_negative_rate": 0.01639344262295082,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8524590163934426,
                            "true_positive_rate": 0.13114754098360656
                        }
                    },
                    "average": {
                        "total_num": 61,
                        "gold_error_num": 60,
                        "metrics": {
                            "accuracy": {
                                "average": 0.14754098360655737,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.13333333333333333,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.23529411764705882,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.01639344262295082,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8524590163934426,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.13114754098360656,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.4307004470938897,
                            "stdev": 0.31324237293546087
                        },
                        "precision": {
                            "average": 0.992690127805627,
                            "stdev": 0.009865342019264494
                        },
                        "recall": {
                            "average": 0.4272727272727273,
                            "stdev": 0.32600061405971226
                        },
                        "f1": {
                            "average": 0.5266622506195309,
                            "stdev": 0.3021498307833101
                        },
                        "true_negative_rate": {
                            "average": 0.010432190760059613,
                            "stdev": 0.007885994965915322
                        },
                        "false_positive_rate": {
                            "average": 0.005961251862891207,
                            "stdev": 0.007885994965915322
                        },
                        "false_negative_rate": {
                            "average": 0.563338301043219,
                            "stdev": 0.3206563416980776
                        },
                        "true_positive_rate": {
                            "average": 0.42026825633383014,
                            "stdev": 0.3206563416980776
                        }
                    }
                }
            },
            "initial_model=meta-llama/Llama-2-70b-chat-hf": {
                "baseline_model=google/gemma-7b-it": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 30,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.3493975903614458,
                            "precision": 0.9666666666666667,
                            "recall": 0.35365853658536583,
                            "f1": 0.5178571428571429,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.6385542168674698,
                            "true_positive_rate": 0.3493975903614458
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3493975903614458,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.9666666666666667,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.35365853658536583,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5178571428571429,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6385542168674698,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3493975903614458,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-13b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 61,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.7469879518072289,
                            "precision": 1.0,
                            "recall": 0.7439024390243902,
                            "f1": 0.8531468531468531,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.25301204819277107,
                            "true_positive_rate": 0.7349397590361446
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7469879518072289,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7439024390243902,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8531468531468531,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.25301204819277107,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7349397590361446,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=meta-llama/Llama-2-70b-chat-hf": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.9879518072289156,
                            "precision": 0.9879518072289156,
                            "recall": 1.0,
                            "f1": 0.9939393939393939,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 0.9879518072289156
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9879518072289156,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.9879518072289156,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9939393939393939,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9879518072289156,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 11,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.14457831325301204,
                            "precision": 1.0,
                            "recall": 0.13414634146341464,
                            "f1": 0.23655913978494625,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8554216867469879,
                            "true_positive_rate": 0.13253012048192772
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.14457831325301204,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.13414634146341464,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.23655913978494625,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8554216867469879,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.13253012048192772,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-14B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 26,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.30120481927710846,
                            "precision": 0.9615384615384616,
                            "recall": 0.3048780487804878,
                            "f1": 0.46296296296296297,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.012048192771084338,
                            "false_negative_rate": 0.6867469879518072,
                            "true_positive_rate": 0.30120481927710846
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.30120481927710846,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.9615384615384616,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3048780487804878,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.46296296296296297,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6867469879518072,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.30120481927710846,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=Qwen/Qwen1.5-72B-Chat": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 13,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.1686746987951807,
                            "precision": 1.0,
                            "recall": 0.15853658536585366,
                            "f1": 0.2736842105263158,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8313253012048193,
                            "true_positive_rate": 0.1566265060240964
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1686746987951807,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.15853658536585366,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.2736842105263158,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8313253012048193,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.1566265060240964,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 2,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.03614457831325301,
                            "precision": 1.0,
                            "recall": 0.024390243902439025,
                            "f1": 0.047619047619047616,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.963855421686747,
                            "true_positive_rate": 0.024096385542168676
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.03614457831325301,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.024390243902439025,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.047619047619047616,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.963855421686747,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.024096385542168676,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=models/gemini-1.0-pro-001": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 3,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.04819277108433735,
                            "precision": 1.0,
                            "recall": 0.036585365853658534,
                            "f1": 0.07058823529411765,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9518072289156626,
                            "true_positive_rate": 0.03614457831325301
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.04819277108433735,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.036585365853658534,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.07058823529411765,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9518072289156626,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.03614457831325301,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=claude-3-opus-20240229": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 8,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.10843373493975904,
                            "precision": 1.0,
                            "recall": 0.0975609756097561,
                            "f1": 0.17777777777777778,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.891566265060241,
                            "true_positive_rate": 0.0963855421686747
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.10843373493975904,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.0975609756097561,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.17777777777777778,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.891566265060241,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.0963855421686747,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0613": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 32,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.39759036144578314,
                            "precision": 1.0,
                            "recall": 0.3902439024390244,
                            "f1": 0.5614035087719298,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6024096385542169,
                            "true_positive_rate": 0.3855421686746988
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.39759036144578314,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3902439024390244,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5614035087719298,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6024096385542169,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3855421686746988,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "cot_instruction_prompt": {
                        "total_num": 83,
                        "prediction_error_num": 47,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": 0.5783132530120482,
                            "precision": 1.0,
                            "recall": 0.573170731707317,
                            "f1": 0.7286821705426356,
                            "true_negative_rate": 0.012048192771084338,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.42168674698795183,
                            "true_positive_rate": 0.5662650602409639
                        }
                    },
                    "average": {
                        "total_num": 83,
                        "gold_error_num": 82,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5783132530120482,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.573170731707317,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7286821705426356,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.012048192771084338,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.42168674698795183,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5662650602409639,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.3515881708652793,
                            "stdev": 0.29300191396237596
                        },
                        "precision": {
                            "average": 0.9923779032212767,
                            "stdev": 0.013805466798081568
                        },
                        "recall": {
                            "average": 0.3470066518847007,
                            "stdev": 0.29882522115751664
                        },
                        "f1": {
                            "average": 0.4476564039293747,
                            "stdev": 0.30298470881590955
                        },
                        "true_negative_rate": {
                            "average": 0.008762322015334063,
                            "stdev": 0.005365804474881004
                        },
                        "false_positive_rate": {
                            "average": 0.003285870755750274,
                            "stdev": 0.005365804474881003
                        },
                        "false_negative_rate": {
                            "average": 0.6451259583789706,
                            "stdev": 0.295224917288149
                        },
                        "true_positive_rate": {
                            "average": 0.3428258488499452,
                            "stdev": 0.295224917288149
                        }
                    }
                }
            }
        }
    }
}