{
    "math_word_problem_generation": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 9,
                        "prediction_error_num": 5,
                        "gold_error_num": 9,
                        "metrics": {
                            "accuracy": 0.5555555555555556,
                            "precision": 1.0,
                            "recall": 0.5555555555555556,
                            "f1": 0.7142857142857143,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4444444444444444,
                            "true_positive_rate": 0.5555555555555556
                        }
                    },
                    "average": {
                        "total_num": 9,
                        "gold_error_num": 9,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5555555555555556,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5555555555555556,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7142857142857143,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4444444444444444,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5555555555555556,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 9,
                        "prediction_error_num": 9,
                        "gold_error_num": 9,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 9,
                        "gold_error_num": 9,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 9,
                        "prediction_error_num": 9,
                        "gold_error_num": 9,
                        "metrics": {
                            "accuracy": 1.0,
                            "precision": 1.0,
                            "recall": 1.0,
                            "f1": 1.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.0,
                            "true_positive_rate": 1.0
                        }
                    },
                    "average": {
                        "total_num": 9,
                        "gold_error_num": 9,
                        "metrics": {
                            "accuracy": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.8518518518518517,
                            "stdev": 0.20951312035156963
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.8518518518518517,
                            "stdev": 0.20951312035156963
                        },
                        "f1": {
                            "average": 0.9047619047619048,
                            "stdev": 0.13468700594029476
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.14814814814814814,
                            "stdev": 0.20951312035156963
                        },
                        "true_positive_rate": {
                            "average": 0.8518518518518517,
                            "stdev": 0.20951312035156963
                        }
                    }
                }
            }
        },
        "Instruction-Following": {
            "initial_model=gpt-4-0613": {
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 24,
                        "prediction_error_num": 15,
                        "gold_error_num": 24,
                        "metrics": {
                            "accuracy": 0.625,
                            "precision": 1.0,
                            "recall": 0.625,
                            "f1": 0.7692307692307693,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.375,
                            "true_positive_rate": 0.625
                        }
                    },
                    "average": {
                        "total_num": 24,
                        "gold_error_num": 24,
                        "metrics": {
                            "accuracy": {
                                "average": 0.625,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.625,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7692307692307693,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.375,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.625,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 24,
                        "prediction_error_num": 23,
                        "gold_error_num": 24,
                        "metrics": {
                            "accuracy": 0.9583333333333334,
                            "precision": 1.0,
                            "recall": 0.9583333333333334,
                            "f1": 0.9787234042553191,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.041666666666666664,
                            "true_positive_rate": 0.9583333333333334
                        }
                    },
                    "average": {
                        "total_num": 24,
                        "gold_error_num": 24,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9583333333333334,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9583333333333334,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9787234042553191,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.041666666666666664,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9583333333333334,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 24,
                        "prediction_error_num": 17,
                        "gold_error_num": 24,
                        "metrics": {
                            "accuracy": 0.7083333333333334,
                            "precision": 1.0,
                            "recall": 0.7083333333333334,
                            "f1": 0.8292682926829268,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2916666666666667,
                            "true_positive_rate": 0.7083333333333334
                        }
                    },
                    "average": {
                        "total_num": 24,
                        "gold_error_num": 24,
                        "metrics": {
                            "accuracy": {
                                "average": 0.7083333333333334,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.7083333333333334,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8292682926829268,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2916666666666667,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.7083333333333334,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.763888888888889,
                            "stdev": 0.14163943093313291
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.763888888888889,
                            "stdev": 0.14163943093313291
                        },
                        "f1": {
                            "average": 0.8590741553896718,
                            "stdev": 0.08808360846813784
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.23611111111111113,
                            "stdev": 0.14163943093313291
                        },
                        "true_positive_rate": {
                            "average": 0.763888888888889,
                            "stdev": 0.14163943093313291
                        }
                    }
                }
            }
        }
    },
    "finegrained_fact_verification": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 14,
                        "prediction_error_num": 13,
                        "gold_error_num": 14,
                        "metrics": {
                            "accuracy": 0.9285714285714286,
                            "precision": 1.0,
                            "recall": 0.9285714285714286,
                            "f1": 0.9629629629629629,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.07142857142857142,
                            "true_positive_rate": 0.9285714285714286
                        }
                    },
                    "average": {
                        "total_num": 14,
                        "gold_error_num": 14,
                        "metrics": {
                            "accuracy": {
                                "average": 0.9285714285714286,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.9285714285714286,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9629629629629629,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.07142857142857142,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.9285714285714286,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 14,
                        "prediction_error_num": 12,
                        "gold_error_num": 14,
                        "metrics": {
                            "accuracy": 0.8571428571428571,
                            "precision": 1.0,
                            "recall": 0.8571428571428571,
                            "f1": 0.9230769230769231,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.14285714285714285,
                            "true_positive_rate": 0.8571428571428571
                        }
                    },
                    "average": {
                        "total_num": 14,
                        "gold_error_num": 14,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.9230769230769231,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.14285714285714285,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 14,
                        "prediction_error_num": 2,
                        "gold_error_num": 14,
                        "metrics": {
                            "accuracy": 0.14285714285714285,
                            "precision": 1.0,
                            "recall": 0.14285714285714285,
                            "f1": 0.25,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8571428571428571,
                            "true_positive_rate": 0.14285714285714285
                        }
                    },
                    "average": {
                        "total_num": 14,
                        "gold_error_num": 14,
                        "metrics": {
                            "accuracy": {
                                "average": 0.14285714285714285,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.14285714285714285,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.25,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8571428571428571,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.14285714285714285,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.6428571428571428,
                            "stdev": 0.35475391489884145
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.6428571428571428,
                            "stdev": 0.35475391489884145
                        },
                        "f1": {
                            "average": 0.7120132953466287,
                            "stdev": 0.32709829088744974
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.35714285714285715,
                            "stdev": 0.35475391489884145
                        },
                        "true_positive_rate": {
                            "average": 0.6428571428571428,
                            "stdev": 0.35475391489884145
                        }
                    }
                }
            }
        },
        "Instruction-Following": {
            "initial_model=gpt-4-0613": {
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 5,
                        "prediction_error_num": 3,
                        "gold_error_num": 5,
                        "metrics": {
                            "accuracy": 0.6,
                            "precision": 1.0,
                            "recall": 0.6,
                            "f1": 0.75,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.4,
                            "true_positive_rate": 0.6
                        }
                    },
                    "average": {
                        "total_num": 5,
                        "gold_error_num": 5,
                        "metrics": {
                            "accuracy": {
                                "average": 0.6,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.6,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.75,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.4,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.6,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 5,
                        "prediction_error_num": 1,
                        "gold_error_num": 5,
                        "metrics": {
                            "accuracy": 0.2,
                            "precision": 1.0,
                            "recall": 0.2,
                            "f1": 0.3333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8,
                            "true_positive_rate": 0.2
                        }
                    },
                    "average": {
                        "total_num": 5,
                        "gold_error_num": 5,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.3333333333333333,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 5,
                        "prediction_error_num": 1,
                        "gold_error_num": 5,
                        "metrics": {
                            "accuracy": 0.2,
                            "precision": 1.0,
                            "recall": 0.2,
                            "f1": 0.3333333333333333,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.8,
                            "true_positive_rate": 0.2
                        }
                    },
                    "average": {
                        "total_num": 5,
                        "gold_error_num": 5,
                        "metrics": {
                            "accuracy": {
                                "average": 0.2,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.2,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.3333333333333333,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.8,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.2,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.3333333333333333,
                            "stdev": 0.18856180831641264
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.3333333333333333,
                            "stdev": 0.18856180831641264
                        },
                        "f1": {
                            "average": 0.47222222222222215,
                            "stdev": 0.19641855032959654
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6666666666666666,
                            "stdev": 0.18856180831641267
                        },
                        "true_positive_rate": {
                            "average": 0.3333333333333333,
                            "stdev": 0.18856180831641264
                        }
                    }
                }
            }
        },
        "Context-Faithfulness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 15,
                        "prediction_error_num": 12,
                        "gold_error_num": 15,
                        "metrics": {
                            "accuracy": 0.8,
                            "precision": 1.0,
                            "recall": 0.8,
                            "f1": 0.8888888888888888,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.2,
                            "true_positive_rate": 0.8
                        }
                    },
                    "average": {
                        "total_num": 15,
                        "gold_error_num": 15,
                        "metrics": {
                            "accuracy": {
                                "average": 0.8,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.8,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.8888888888888888,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.2,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.8,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 15,
                        "prediction_error_num": 5,
                        "gold_error_num": 15,
                        "metrics": {
                            "accuracy": 0.3333333333333333,
                            "precision": 1.0,
                            "recall": 0.3333333333333333,
                            "f1": 0.5,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.6666666666666666,
                            "true_positive_rate": 0.3333333333333333
                        }
                    },
                    "average": {
                        "total_num": 15,
                        "gold_error_num": 15,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3333333333333333,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3333333333333333,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.6666666666666666,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3333333333333333,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 15,
                        "prediction_error_num": 1,
                        "gold_error_num": 15,
                        "metrics": {
                            "accuracy": 0.06666666666666667,
                            "precision": 1.0,
                            "recall": 0.06666666666666667,
                            "f1": 0.125,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9333333333333333,
                            "true_positive_rate": 0.06666666666666667
                        }
                    },
                    "average": {
                        "total_num": 15,
                        "gold_error_num": 15,
                        "metrics": {
                            "accuracy": {
                                "average": 0.06666666666666667,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.06666666666666667,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.125,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9333333333333333,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.06666666666666667,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.39999999999999997,
                            "stdev": 0.3030707043774635
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.39999999999999997,
                            "stdev": 0.3030707043774635
                        },
                        "f1": {
                            "average": 0.5046296296296297,
                            "stdev": 0.31187351468702385
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6,
                            "stdev": 0.3030707043774635
                        },
                        "true_positive_rate": {
                            "average": 0.39999999999999997,
                            "stdev": 0.3030707043774635
                        }
                    }
                }
            }
        }
    },
    "answerability_classification": {
        "Reasoning Correctness": {
            "initial_model=gpt-4-0613": {
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 13,
                        "prediction_error_num": 7,
                        "gold_error_num": 13,
                        "metrics": {
                            "accuracy": 0.5384615384615384,
                            "precision": 1.0,
                            "recall": 0.5384615384615384,
                            "f1": 0.7,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.46153846153846156,
                            "true_positive_rate": 0.5384615384615384
                        }
                    },
                    "average": {
                        "total_num": 13,
                        "gold_error_num": 13,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.46153846153846156,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 13,
                        "prediction_error_num": 7,
                        "gold_error_num": 13,
                        "metrics": {
                            "accuracy": 0.5384615384615384,
                            "precision": 1.0,
                            "recall": 0.5384615384615384,
                            "f1": 0.7,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.46153846153846156,
                            "true_positive_rate": 0.5384615384615384
                        }
                    },
                    "average": {
                        "total_num": 13,
                        "gold_error_num": 13,
                        "metrics": {
                            "accuracy": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.7,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.46153846153846156,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.5384615384615384,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 13,
                        "prediction_error_num": 0,
                        "gold_error_num": 13,
                        "metrics": {
                            "accuracy": 0.0,
                            "precision": 0.0,
                            "recall": 0.0,
                            "f1": 0.0,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 1.0,
                            "true_positive_rate": 0.0
                        }
                    },
                    "average": {
                        "total_num": 13,
                        "gold_error_num": 13,
                        "metrics": {
                            "accuracy": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.358974358974359,
                            "stdev": 0.2538332035028632
                        },
                        "precision": {
                            "average": 0.6666666666666666,
                            "stdev": 0.4714045207910317
                        },
                        "recall": {
                            "average": 0.358974358974359,
                            "stdev": 0.2538332035028632
                        },
                        "f1": {
                            "average": 0.4666666666666666,
                            "stdev": 0.3299831645537222
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.6410256410256411,
                            "stdev": 0.2538332035028632
                        },
                        "true_positive_rate": {
                            "average": 0.358974358974359,
                            "stdev": 0.2538332035028632
                        }
                    }
                }
            }
        },
        "Parameterized Knowledge": {
            "initial_model=gpt-4-0613": {
                "baseline_model=mistralai/Mixtral-8x7B-Instruct-v0.1": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 20,
                        "prediction_error_num": 6,
                        "gold_error_num": 20,
                        "metrics": {
                            "accuracy": 0.3,
                            "precision": 1.0,
                            "recall": 0.3,
                            "f1": 0.46153846153846156,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.7,
                            "true_positive_rate": 0.3
                        }
                    },
                    "average": {
                        "total_num": 20,
                        "gold_error_num": 20,
                        "metrics": {
                            "accuracy": {
                                "average": 0.3,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.3,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.46153846153846156,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.7,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.3,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-3.5-turbo-0125": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 20,
                        "prediction_error_num": 7,
                        "gold_error_num": 20,
                        "metrics": {
                            "accuracy": 0.35,
                            "precision": 1.0,
                            "recall": 0.35,
                            "f1": 0.5185185185185185,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.65,
                            "true_positive_rate": 0.35
                        }
                    },
                    "average": {
                        "total_num": 20,
                        "gold_error_num": 20,
                        "metrics": {
                            "accuracy": {
                                "average": 0.35,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.35,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.5185185185185185,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.65,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.35,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "baseline_model=gpt-4-0125-preview": {
                    "baseline_errordetection_prompt_1": {
                        "total_num": 20,
                        "prediction_error_num": 2,
                        "gold_error_num": 20,
                        "metrics": {
                            "accuracy": 0.1,
                            "precision": 1.0,
                            "recall": 0.1,
                            "f1": 0.18181818181818182,
                            "true_negative_rate": 0.0,
                            "false_positive_rate": 0.0,
                            "false_negative_rate": 0.9,
                            "true_positive_rate": 0.1
                        }
                    },
                    "average": {
                        "total_num": 20,
                        "gold_error_num": 20,
                        "metrics": {
                            "accuracy": {
                                "average": 0.1,
                                "stdev": 0.0
                            },
                            "precision": {
                                "average": 1.0,
                                "stdev": 0.0
                            },
                            "recall": {
                                "average": 0.1,
                                "stdev": 0.0
                            },
                            "f1": {
                                "average": 0.18181818181818182,
                                "stdev": 0.0
                            },
                            "true_negative_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_positive_rate": {
                                "average": 0.0,
                                "stdev": 0.0
                            },
                            "false_negative_rate": {
                                "average": 0.9,
                                "stdev": 0.0
                            },
                            "true_positive_rate": {
                                "average": 0.1,
                                "stdev": 0.0
                            }
                        }
                    }
                },
                "average": {
                    "metrics": {
                        "accuracy": {
                            "average": 0.24999999999999997,
                            "stdev": 0.10801234497346432
                        },
                        "precision": {
                            "average": 1.0,
                            "stdev": 0.0
                        },
                        "recall": {
                            "average": 0.24999999999999997,
                            "stdev": 0.10801234497346432
                        },
                        "f1": {
                            "average": 0.38729172062505396,
                            "stdev": 0.14714213782536265
                        },
                        "true_negative_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_positive_rate": {
                            "average": 0.0,
                            "stdev": 0.0
                        },
                        "false_negative_rate": {
                            "average": 0.75,
                            "stdev": 0.10801234497346433
                        },
                        "true_positive_rate": {
                            "average": 0.24999999999999997,
                            "stdev": 0.10801234497346432
                        }
                    }
                }
            }
        }
    }
}