[
    {
        "test": "limited-list-referents",
        "responder": "qwen-v2.5-14b-instruct",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon monoxide (CO)",
            "Carbon dioxide (CO2)",
            "Nitric oxide (NO)",
            "Nitrogen dioxide (NO2)",
            "Sulfur dioxide (SO2)",
            "Ozone (O3)",
            "Nitrous oxide (N2O)",
            "Phosgene (COCl2)",
            "Hydrogen sulfide (H2S)",
            "Dinitrogen monoxide (N2O)",
            "Dinitrogen trioxide (N2O3)",
            "Peroxynitrite (ONOO-)",
            "Chlorine dioxide (ClO2)",
            "Dihydrogen monoxide (H2O2)*",
            "Sulfuryl fluoride (SO2F2)",
            "Nitrogen trioxide (N2O3)",
            "Hydrogen peroxide (H2O2)*",
            "Nitrogen monoxide (NO)",
            "Sulfur trioxide (SO3)",
            "Oxygen difluoride (OF2)",
            "Dinitrogen pentoxide (N2O5)",
            "Chlorine monoxide (ClO)",
            "Nitrogen dioxide (NO2)",
            "Dihydrogen disulfide (H2S2)*"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon monoxide (CO)",
                "Carbon dioxide (CO2)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Sulfur dioxide (SO2)",
                "Ozone (O3)",
                "Nitrous oxide (N2O)",
                "Phosgene (COCl2)",
                "Hydrogen sulfide (H2S)",
                "Dinitrogen monoxide (N2O)",
                "Dinitrogen trioxide (N2O3)",
                "Peroxynitrite (ONOO-)",
                "Chlorine dioxide (ClO2)",
                "Dihydrogen monoxide (H2O2)*",
                "Sulfuryl fluoride (SO2F2)",
                "Nitrogen trioxide (N2O3)",
                "Hydrogen peroxide (H2O2)*",
                "Nitrogen monoxide (NO)",
                "Sulfur trioxide (SO3)",
                "Oxygen difluoride (OF2)",
                "Dinitrogen pentoxide (N2O5)",
                "Chlorine monoxide (ClO)",
                "Dihydrogen disulfide (H2S2)*"
            ],
            "mismatches": [],
            "true_referents": [
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Chlorine dioxide (ClO2)",
                "Chlorine monoxide (ClO)",
                "Dihydrogen disulfide (H2S2)*",
                "Dihydrogen monoxide (H2O2)*",
                "Dinitrogen monoxide (N2O)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen trioxide (N2O3)",
                "Hydrogen peroxide (H2O2)*",
                "Hydrogen sulfide (H2S)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrogen monoxide (NO)",
                "Nitrogen trioxide (N2O3)",
                "Nitrous oxide (N2O)",
                "Oxygen difluoride (OF2)",
                "Ozone (O3)",
                "Peroxynitrite (ONOO-)",
                "Phosgene (COCl2)",
                "Sulfur dioxide (SO2)",
                "Sulfur trioxide (SO3)",
                "Sulfuryl fluoride (SO2F2)"
            ],
            "TP": 23,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "phi-v4",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon dioxide (CO2)",
            "Nitrogen dioxide (NO2)",
            "Sulfur dioxide (SO2)",
            "Ozone (O3)",
            "Carbon monoxide (CO)",
            "Nitrous oxide (N2O)",
            "Water vapor (H2O, gaseous)",
            "Dinitrogen trioxide (N2O3)",
            "Dinitrogen tetroxide (N2O4)",
            "Sulfur trioxide (SO3)",
            "Phosphorus pentoxide (P4O10)",
            "Carbon suboxide (C3O2)",
            "Dioxygen difluoride (O2F2)",
            "Diphosphorus pentoxide (P2O5)",
            "Dinitrogen pentoxide (N2O5)",
            "Silicon dioxide (SiO2, gaseous form)",
            "Germanium dioxide (GeO2, gaseous form)",
            "Tin dioxide (SnO2, gaseous form)",
            "Lead dioxide (PbO2, gaseous form)",
            "Selenium dioxide (SeO2)",
            "Tellurium dioxide (TeO2)",
            "Arsenic trioxide (As2O3)",
            "Antimony trioxide (Sb2O3)",
            "Bismuth trioxide (Bi2O3)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon dioxide (CO2)",
                "Nitrogen dioxide (NO2)",
                "Sulfur dioxide (SO2)",
                "Ozone (O3)",
                "Carbon monoxide (CO)",
                "Nitrous oxide (N2O)",
                "Water vapor (H2O, gaseous)",
                "Dinitrogen trioxide (N2O3)",
                "Dinitrogen tetroxide (N2O4)",
                "Sulfur trioxide (SO3)",
                "Phosphorus pentoxide (P4O10)",
                "Carbon suboxide (C3O2)",
                "Dioxygen difluoride (O2F2)",
                "Diphosphorus pentoxide (P2O5)",
                "Dinitrogen pentoxide (N2O5)",
                "Silicon dioxide (SiO2, gaseous form)",
                "Germanium dioxide (GeO2, gaseous form)",
                "Tin dioxide (SnO2, gaseous form)",
                "Lead dioxide (PbO2, gaseous form)",
                "Selenium dioxide (SeO2)",
                "Tellurium dioxide (TeO2)",
                "Arsenic trioxide (As2O3)",
                "Antimony trioxide (Sb2O3)",
                "Bismuth trioxide (Bi2O3)"
            ],
            "mismatches": [],
            "true_referents": [
                "Antimony trioxide (Sb2O3)",
                "Arsenic trioxide (As2O3)",
                "Bismuth trioxide (Bi2O3)",
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Carbon suboxide (C3O2)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen trioxide (N2O3)",
                "Dioxygen difluoride (O2F2)",
                "Diphosphorus pentoxide (P2O5)",
                "Germanium dioxide (GeO2, gaseous form)",
                "Lead dioxide (PbO2, gaseous form)",
                "Nitrogen dioxide (NO2)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P4O10)",
                "Selenium dioxide (SeO2)",
                "Silicon dioxide (SiO2, gaseous form)",
                "Sulfur dioxide (SO2)",
                "Sulfur trioxide (SO3)",
                "Tellurium dioxide (TeO2)",
                "Tin dioxide (SnO2, gaseous form)",
                "Water vapor (H2O, gaseous)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "mistral-small-instruct-24B",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon monoxide (CO)",
            "Carbon dioxide (CO2)",
            "Nitrogen monoxide (NO)",
            "Nitrogen dioxide (NO2)",
            "Sulfur dioxide (SO2)",
            "Sulfur trioxide (SO3)",
            "Dinitrogen monoxide (N2O)",
            "Dinitrogen trioxide (N2O3)",
            "Dinitrogen tetroxide (N2O4)",
            "Dinitrogen pentoxide (N2O5)",
            "Nitrous oxide (N2O)",
            "Ozone (O3)",
            "Phosphorus pentoxide (P2O5)",
            "Phosphorus trioxide (P4O6)",
            "Diphosphorus pentaoxide (P2O5)",
            "Chlorine monoxide (Cl2O)",
            "Chlorine dioxide (ClO2)",
            "Chlorine trioxide (Cl2O3)",
            "Chlorine heptoxide (Cl2O7)",
            "Selenium dioxide (SeO2)",
            "Selenium trioxide (SeO3)",
            "Tellurium dioxide (TeO2)",
            "Tellurium trioxide (TeO3)",
            "Xenon trioxide (XeO3)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon monoxide (CO)",
                "Carbon dioxide (CO2)",
                "Nitrogen monoxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Sulfur dioxide (SO2)",
                "Sulfur trioxide (SO3)",
                "Dinitrogen monoxide (N2O)",
                "Dinitrogen trioxide (N2O3)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen pentoxide (N2O5)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P2O5)",
                "Phosphorus trioxide (P4O6)",
                "Diphosphorus pentaoxide (P2O5)",
                "Chlorine monoxide (Cl2O)",
                "Chlorine dioxide (ClO2)",
                "Chlorine trioxide (Cl2O3)",
                "Chlorine heptoxide (Cl2O7)",
                "Selenium dioxide (SeO2)",
                "Selenium trioxide (SeO3)",
                "Tellurium dioxide (TeO2)",
                "Tellurium trioxide (TeO3)",
                "Xenon trioxide (XeO3)"
            ],
            "mismatches": [],
            "true_referents": [
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Chlorine dioxide (ClO2)",
                "Chlorine heptoxide (Cl2O7)",
                "Chlorine monoxide (Cl2O)",
                "Chlorine trioxide (Cl2O3)",
                "Dinitrogen monoxide (N2O)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen trioxide (N2O3)",
                "Diphosphorus pentaoxide (P2O5)",
                "Nitrogen dioxide (NO2)",
                "Nitrogen monoxide (NO)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P2O5)",
                "Phosphorus trioxide (P4O6)",
                "Selenium dioxide (SeO2)",
                "Selenium trioxide (SeO3)",
                "Sulfur dioxide (SO2)",
                "Sulfur trioxide (SO3)",
                "Tellurium dioxide (TeO2)",
                "Tellurium trioxide (TeO3)",
                "Xenon trioxide (XeO3)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gemma-3",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Nitrous Oxide (N\u2082O)",
            "Dinitrogen Monoxide (N\u2082O)",
            "Laughing Gas (N\u2082O)",
            "Carbon Monoxide (CO)",
            "Carbon Dioxide (CO\u2082)",
            "Silicon Dioxide (SiO\u2082 - gaseous form)",
            "Sulfur Dioxide (SO\u2082)",
            "Sulfur Trioxide (SO\u2083)",
            "Phosphorus Pentoxide (P\u2084O\u2081\u2080 - gaseous form)",
            "Dinitrogen Pentoxide (N\u2082O\u2085)",
            "Hydrogen Peroxide (H\u2082O\u2082 - decomposes to O\u2082 gas)",
            "Osmium Tetroxide (OsO\u2084)",
            "Chromium Trioxide (CrO\u2083 - gaseous form)",
            "Molybdenum Hexoxide (MoO\u2086 - gaseous form)",
            "Vanadium Pentoxide (V\u2082O\u2085 - gaseous form)",
            "Tellurium Hexoxide (TeO\u2086 - gaseous form)",
            "Selenium Dioxide (SeO\u2082 - sublimates to gaseous form)",
            "Boron Trioxide (B\u2082O\u2083 - gaseous form)",
            "Xenon Trioxide (XeO\u2083)",
            "Chlorine Dioxide (ClO\u2082)",
            "Bromine Monoxide (BrO)",
            "Iodine Pentoxide (I\u2082O\u2085 - gaseous form)",
            "Nitrogen Dioxide (NO\u2082)",
            "Nitrogen Trioxide (NO\u2083)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Nitrous Oxide (N\u2082O)",
                "Dinitrogen Monoxide (N\u2082O)",
                "Laughing Gas (N\u2082O)",
                "Carbon Monoxide (CO)",
                "Carbon Dioxide (CO\u2082)",
                "Silicon Dioxide (SiO\u2082 - gaseous form)",
                "Sulfur Dioxide (SO\u2082)",
                "Sulfur Trioxide (SO\u2083)",
                "Phosphorus Pentoxide (P\u2084O\u2081\u2080 - gaseous form)",
                "Dinitrogen Pentoxide (N\u2082O\u2085)",
                "Hydrogen Peroxide (H\u2082O\u2082 - decomposes to O\u2082 gas)",
                "Osmium Tetroxide (OsO\u2084)",
                "Chromium Trioxide (CrO\u2083 - gaseous form)",
                "Molybdenum Hexoxide (MoO\u2086 - gaseous form)",
                "Vanadium Pentoxide (V\u2082O\u2085 - gaseous form)",
                "Tellurium Hexoxide (TeO\u2086 - gaseous form)",
                "Selenium Dioxide (SeO\u2082 - sublimates to gaseous form)",
                "Boron Trioxide (B\u2082O\u2083 - gaseous form)",
                "Xenon Trioxide (XeO\u2083)",
                "Chlorine Dioxide (ClO\u2082)",
                "Bromine Monoxide (BrO)",
                "Iodine Pentoxide (I\u2082O\u2085 - gaseous form)",
                "Nitrogen Dioxide (NO\u2082)",
                "Nitrogen Trioxide (NO\u2083)"
            ],
            "mismatches": [],
            "true_referents": [
                "Boron Trioxide (B\u2082O\u2083 - gaseous form)",
                "Bromine Monoxide (BrO)",
                "Carbon Dioxide (CO\u2082)",
                "Carbon Monoxide (CO)",
                "Chlorine Dioxide (ClO\u2082)",
                "Chromium Trioxide (CrO\u2083 - gaseous form)",
                "Dinitrogen Monoxide (N\u2082O)",
                "Dinitrogen Pentoxide (N\u2082O\u2085)",
                "Hydrogen Peroxide (H\u2082O\u2082 - decomposes to O\u2082 gas)",
                "Iodine Pentoxide (I\u2082O\u2085 - gaseous form)",
                "Laughing Gas (N\u2082O)",
                "Molybdenum Hexoxide (MoO\u2086 - gaseous form)",
                "Nitrogen Dioxide (NO\u2082)",
                "Nitrogen Trioxide (NO\u2083)",
                "Nitrous Oxide (N\u2082O)",
                "Osmium Tetroxide (OsO\u2084)",
                "Phosphorus Pentoxide (P\u2084O\u2081\u2080 - gaseous form)",
                "Selenium Dioxide (SeO\u2082 - sublimates to gaseous form)",
                "Silicon Dioxide (SiO\u2082 - gaseous form)",
                "Sulfur Dioxide (SO\u2082)",
                "Sulfur Trioxide (SO\u2083)",
                "Tellurium Hexoxide (TeO\u2086 - gaseous form)",
                "Vanadium Pentoxide (V\u2082O\u2085 - gaseous form)",
                "Xenon Trioxide (XeO\u2083)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4o",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon Dioxide (CO2)",
            "Carbon Monoxide (CO)",
            "Nitric Oxide (NO)",
            "Nitrogen Dioxide (NO2)",
            "Sulfur Dioxide (SO2)",
            "Sulfur Trioxide (SO3)",
            "Dinitrogen Tetroxide (N2O4)",
            "Dinitrogen Pentoxide (N2O5)",
            "Nitrous Oxide (N2O)",
            "Phosphorus Pentoxide (P4O10)",
            "Phosphorus Trioxide (P4O6)",
            "Chlorine Dioxide (ClO2)",
            "Dichlorine Monoxide (Cl2O)",
            "Dichlorine Trioxide (Cl2O3)",
            "Dichlorine Hexoxide (Cl2O6)",
            "Dichlorine Heptoxide (Cl2O7)",
            "Bromine Monoxide (BrO)",
            "Bromine Dioxide (BrO2)",
            "Bromine Trioxide (BrO3)",
            "Iodine Pentoxide (I2O5)",
            "Iodine Heptoxide (I2O7)",
            "Vanadium Pentoxide (V2O5)",
            "Chromium Trioxide (CrO3)",
            "Manganese Heptoxide (Mn2O7)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon Dioxide (CO2)",
                "Carbon Monoxide (CO)",
                "Nitric Oxide (NO)",
                "Nitrogen Dioxide (NO2)",
                "Sulfur Dioxide (SO2)",
                "Sulfur Trioxide (SO3)",
                "Dinitrogen Tetroxide (N2O4)",
                "Dinitrogen Pentoxide (N2O5)",
                "Nitrous Oxide (N2O)",
                "Phosphorus Pentoxide (P4O10)",
                "Phosphorus Trioxide (P4O6)",
                "Chlorine Dioxide (ClO2)",
                "Dichlorine Monoxide (Cl2O)",
                "Dichlorine Trioxide (Cl2O3)",
                "Dichlorine Hexoxide (Cl2O6)",
                "Dichlorine Heptoxide (Cl2O7)",
                "Bromine Monoxide (BrO)",
                "Bromine Dioxide (BrO2)",
                "Bromine Trioxide (BrO3)",
                "Iodine Pentoxide (I2O5)",
                "Iodine Heptoxide (I2O7)",
                "Vanadium Pentoxide (V2O5)",
                "Chromium Trioxide (CrO3)",
                "Manganese Heptoxide (Mn2O7)"
            ],
            "mismatches": [],
            "true_referents": [
                "Bromine Dioxide (BrO2)",
                "Bromine Monoxide (BrO)",
                "Bromine Trioxide (BrO3)",
                "Carbon Dioxide (CO2)",
                "Carbon Monoxide (CO)",
                "Chlorine Dioxide (ClO2)",
                "Chromium Trioxide (CrO3)",
                "Dichlorine Heptoxide (Cl2O7)",
                "Dichlorine Hexoxide (Cl2O6)",
                "Dichlorine Monoxide (Cl2O)",
                "Dichlorine Trioxide (Cl2O3)",
                "Dinitrogen Pentoxide (N2O5)",
                "Dinitrogen Tetroxide (N2O4)",
                "Iodine Heptoxide (I2O7)",
                "Iodine Pentoxide (I2O5)",
                "Manganese Heptoxide (Mn2O7)",
                "Nitric Oxide (NO)",
                "Nitrogen Dioxide (NO2)",
                "Nitrous Oxide (N2O)",
                "Phosphorus Pentoxide (P4O10)",
                "Phosphorus Trioxide (P4O6)",
                "Sulfur Dioxide (SO2)",
                "Sulfur Trioxide (SO3)",
                "Vanadium Pentoxide (V2O5)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4o-mini",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon dioxide (CO2)",
            "Nitrogen dioxide (NO2)",
            "Sulfur dioxide (SO2)",
            "Ozone (O3)",
            "Phosphorus pentoxide (P2O5)",
            "Hydrogen peroxide (H2O2)",
            "Nitrous oxide (N2O)",
            "Carbon monoxide (CO)",
            "Silicon dioxide (SiO2)",
            "Arsenic trioxide (As2O3)",
            "Boron trioxide (B2O3)",
            "Manganese dioxide (MnO2)",
            "Lead(II) oxide (PbO)",
            "Zinc oxide (ZnO)",
            "Tin(IV) oxide (SnO2)",
            "Chromium trioxide (CrO3)",
            "Vanadium pentoxide (V2O5)",
            "Cobalt(II) oxide (CoO)",
            "Copper(II) oxide (CuO)",
            "Iron(III) oxide (Fe2O3)",
            "Mercury(II) oxide (HgO)",
            "Tungsten trioxide (WO3)",
            "Titanium dioxide (TiO2)",
            "Selenium dioxide (SeO2)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon dioxide (CO2)",
                "Nitrogen dioxide (NO2)",
                "Sulfur dioxide (SO2)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P2O5)",
                "Hydrogen peroxide (H2O2)",
                "Nitrous oxide (N2O)",
                "Carbon monoxide (CO)",
                "Silicon dioxide (SiO2)",
                "Arsenic trioxide (As2O3)",
                "Boron trioxide (B2O3)",
                "Manganese dioxide (MnO2)",
                "Lead(II) oxide (PbO)",
                "Zinc oxide (ZnO)",
                "Tin(IV) oxide (SnO2)",
                "Chromium trioxide (CrO3)",
                "Vanadium pentoxide (V2O5)",
                "Cobalt(II) oxide (CoO)",
                "Copper(II) oxide (CuO)",
                "Iron(III) oxide (Fe2O3)",
                "Mercury(II) oxide (HgO)",
                "Tungsten trioxide (WO3)",
                "Titanium dioxide (TiO2)",
                "Selenium dioxide (SeO2)"
            ],
            "mismatches": [],
            "true_referents": [
                "Arsenic trioxide (As2O3)",
                "Boron trioxide (B2O3)",
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Chromium trioxide (CrO3)",
                "Cobalt(II) oxide (CoO)",
                "Copper(II) oxide (CuO)",
                "Hydrogen peroxide (H2O2)",
                "Iron(III) oxide (Fe2O3)",
                "Lead(II) oxide (PbO)",
                "Manganese dioxide (MnO2)",
                "Mercury(II) oxide (HgO)",
                "Nitrogen dioxide (NO2)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P2O5)",
                "Selenium dioxide (SeO2)",
                "Silicon dioxide (SiO2)",
                "Sulfur dioxide (SO2)",
                "Tin(IV) oxide (SnO2)",
                "Titanium dioxide (TiO2)",
                "Tungsten trioxide (WO3)",
                "Vanadium pentoxide (V2O5)",
                "Zinc oxide (ZnO)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4-32k",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Nitric Oxide (NO)",
            "Nitrogen Dioxide (NO2)",
            "Dinitrogen Trioxide (N2O3)",
            "Dinitrogen Tetroxide (N2O4)",
            "Dinitrogen Pentoxide (N2O5)",
            "Sulfur Dioxide (SO2)",
            "Sulfur Trioxide (SO3)",
            "Carbon Dioxide (CO2)",
            "Carbon Monoxide (CO)",
            "Oxygen Difluoride (OF2)",
            "Oxygen Monofluoride (OF)",
            "Ozone (O3)",
            "Water Vapor (H2O)",
            "Sulfur Hexafluoride (SF6)",
            "Nitrous Oxide (N2O)",
            "Phosphorus Pentoxide (P4O10)",
            "Selenium Dioxide (SeO2)",
            "Selenium Trioxide (SeO3)",
            "Tellurium Dioxide (TeO2)",
            "Tellurium Trioxide (TeO3)",
            "Polonium Dioxide (PoO2)",
            "Polonium Trioxide (PoO3)",
            "Radon Trioxide (RnO3)",
            "Radon Tetroxide (RnO4)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Nitric Oxide (NO)",
                "Nitrogen Dioxide (NO2)",
                "Dinitrogen Trioxide (N2O3)",
                "Dinitrogen Tetroxide (N2O4)",
                "Dinitrogen Pentoxide (N2O5)",
                "Sulfur Dioxide (SO2)",
                "Sulfur Trioxide (SO3)",
                "Carbon Dioxide (CO2)",
                "Carbon Monoxide (CO)",
                "Oxygen Difluoride (OF2)",
                "Oxygen Monofluoride (OF)",
                "Ozone (O3)",
                "Water Vapor (H2O)",
                "Sulfur Hexafluoride (SF6)",
                "Nitrous Oxide (N2O)",
                "Phosphorus Pentoxide (P4O10)",
                "Selenium Dioxide (SeO2)",
                "Selenium Trioxide (SeO3)",
                "Tellurium Dioxide (TeO2)",
                "Tellurium Trioxide (TeO3)",
                "Polonium Dioxide (PoO2)",
                "Polonium Trioxide (PoO3)",
                "Radon Trioxide (RnO3)",
                "Radon Tetroxide (RnO4)"
            ],
            "mismatches": [],
            "true_referents": [
                "Carbon Dioxide (CO2)",
                "Carbon Monoxide (CO)",
                "Dinitrogen Pentoxide (N2O5)",
                "Dinitrogen Tetroxide (N2O4)",
                "Dinitrogen Trioxide (N2O3)",
                "Nitric Oxide (NO)",
                "Nitrogen Dioxide (NO2)",
                "Nitrous Oxide (N2O)",
                "Oxygen Difluoride (OF2)",
                "Oxygen Monofluoride (OF)",
                "Ozone (O3)",
                "Phosphorus Pentoxide (P4O10)",
                "Polonium Dioxide (PoO2)",
                "Polonium Trioxide (PoO3)",
                "Radon Tetroxide (RnO4)",
                "Radon Trioxide (RnO3)",
                "Selenium Dioxide (SeO2)",
                "Selenium Trioxide (SeO3)",
                "Sulfur Dioxide (SO2)",
                "Sulfur Hexafluoride (SF6)",
                "Sulfur Trioxide (SO3)",
                "Tellurium Dioxide (TeO2)",
                "Tellurium Trioxide (TeO3)",
                "Water Vapor (H2O)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-35-turbo-16k",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon dioxide (CO2)",
            "Nitrous oxide (N2O)",
            "Sulfur dioxide (SO2)",
            "Nitric oxide (NO)",
            "Nitrogen dioxide (NO2)",
            "Ozone (O3)",
            "Carbon monoxide (CO)",
            "Sulfur trioxide (SO3)",
            "Dinitrogen tetroxide (N2O4)",
            "Dinitrogen pentoxide (N2O5)",
            "Carbon suboxide (C3O2)",
            "Sulfur hexafluoride (SF6)",
            "Phosphorus pentoxide (P2O5)",
            "Chlorine dioxide (ClO2)",
            "Sulfur trioxide dimer (S6O6)",
            "Phosphorus pentasulfide (P2S5)",
            "Dinitrogen trioxide (N2O3)",
            "Dinitrogen tetraoxide (N2O4)",
            "Sulfur trioxide hexamer (S6O6)",
            "Nitrogen pentoxide (N2O5)",
            "Phosphorus pentachloride (PCl5)",
            "Sulfur trioxide trimer (S3O9)",
            "Nitrogen trioxide (NO3)",
            "Sulfur dioxide dimer (S2O4)",
            "Sulfur trioxide dimer (S6O12)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon dioxide (CO2)",
                "Nitrous oxide (N2O)",
                "Sulfur dioxide (SO2)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Ozone (O3)",
                "Carbon monoxide (CO)",
                "Sulfur trioxide (SO3)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen pentoxide (N2O5)",
                "Carbon suboxide (C3O2)",
                "Sulfur hexafluoride (SF6)",
                "Phosphorus pentoxide (P2O5)",
                "Chlorine dioxide (ClO2)",
                "Sulfur trioxide dimer (S6O6)",
                "Phosphorus pentasulfide (P2S5)",
                "Dinitrogen trioxide (N2O3)",
                "Sulfur trioxide hexamer (S6O6)",
                "Nitrogen pentoxide (N2O5)",
                "Phosphorus pentachloride (PCl5)",
                "Sulfur trioxide trimer (S3O9)",
                "Nitrogen trioxide (NO3)",
                "Sulfur dioxide dimer (S2O4)"
            ],
            "mismatches": [
                "Dinitrogen tetraoxide (N2O4)",
                "Sulfur trioxide dimer (S6O12)"
            ],
            "true_referents": [
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Carbon suboxide (C3O2)",
                "Chlorine dioxide (ClO2)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen tetraoxide (N2O4)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen trioxide (N2O3)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrogen pentoxide (N2O5)",
                "Nitrogen trioxide (NO3)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Phosphorus pentachloride (PCl5)",
                "Phosphorus pentasulfide (P2S5)",
                "Phosphorus pentoxide (P2O5)",
                "Sulfur dioxide (SO2)",
                "Sulfur dioxide dimer (S2O4)",
                "Sulfur hexafluoride (SF6)",
                "Sulfur trioxide (SO3)",
                "Sulfur trioxide dimer (S6O6)",
                "Sulfur trioxide hexamer (S6O6)",
                "Sulfur trioxide trimer (S3O9)"
            ],
            "TP": 23,
            "FP": 2,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "o1-mini",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Water (H\u2082O)",
            "Carbon dioxide (CO\u2082)",
            "Carbon monoxide (CO)",
            "Nitric oxide (NO)",
            "Nitrogen dioxide (NO\u2082)",
            "Nitrous oxide (N\u2082O)",
            "Dinitrogen trioxide (N\u2082O\u2083)",
            "Dinitrogen tetroxide (N\u2082O\u2084)",
            "Sulfur dioxide (SO\u2082)",
            "Sulfur trioxide (SO\u2083)",
            "Ozone (O\u2083)",
            "Chlorine monoxide (ClO)",
            "Chlorine dioxide (ClO\u2082)",
            "Dichlorine trioxide (Cl\u2082O\u2083)",
            "Dichlorine heptoxide (Cl\u2082O\u2087)",
            "Bromine monoxide (BrO)",
            "Bromine dioxide (BrO\u2082)",
            "Bromine trioxide (Br\u2082O\u2083)",
            "Iodine monoxide (IO)",
            "Iodine dioxide (IO\u2082)",
            "Iodine trioxide (I\u2082O\u2083)",
            "Xenon trioxide (XeO\u2083)",
            "Xenon tetroxide (XeO\u2084)",
            "Hydrogen peroxide (H\u2082O\u2082)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Water (H\u2082O)",
                "Carbon dioxide (CO\u2082)",
                "Carbon monoxide (CO)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO\u2082)",
                "Nitrous oxide (N\u2082O)",
                "Dinitrogen trioxide (N\u2082O\u2083)",
                "Dinitrogen tetroxide (N\u2082O\u2084)",
                "Sulfur dioxide (SO\u2082)",
                "Sulfur trioxide (SO\u2083)",
                "Ozone (O\u2083)",
                "Chlorine monoxide (ClO)",
                "Chlorine dioxide (ClO\u2082)",
                "Dichlorine trioxide (Cl\u2082O\u2083)",
                "Dichlorine heptoxide (Cl\u2082O\u2087)",
                "Bromine monoxide (BrO)",
                "Bromine dioxide (BrO\u2082)",
                "Bromine trioxide (Br\u2082O\u2083)",
                "Iodine monoxide (IO)",
                "Iodine dioxide (IO\u2082)",
                "Iodine trioxide (I\u2082O\u2083)",
                "Xenon trioxide (XeO\u2083)",
                "Xenon tetroxide (XeO\u2084)",
                "Hydrogen peroxide (H\u2082O\u2082)"
            ],
            "mismatches": [],
            "true_referents": [
                "Bromine dioxide (BrO\u2082)",
                "Bromine monoxide (BrO)",
                "Bromine trioxide (Br\u2082O\u2083)",
                "Carbon dioxide (CO\u2082)",
                "Carbon monoxide (CO)",
                "Chlorine dioxide (ClO\u2082)",
                "Chlorine monoxide (ClO)",
                "Dichlorine heptoxide (Cl\u2082O\u2087)",
                "Dichlorine trioxide (Cl\u2082O\u2083)",
                "Dinitrogen tetroxide (N\u2082O\u2084)",
                "Dinitrogen trioxide (N\u2082O\u2083)",
                "Hydrogen peroxide (H\u2082O\u2082)",
                "Iodine dioxide (IO\u2082)",
                "Iodine monoxide (IO)",
                "Iodine trioxide (I\u2082O\u2083)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO\u2082)",
                "Nitrous oxide (N\u2082O)",
                "Ozone (O\u2083)",
                "Sulfur dioxide (SO\u2082)",
                "Sulfur trioxide (SO\u2083)",
                "Water (H\u2082O)",
                "Xenon tetroxide (XeO\u2084)",
                "Xenon trioxide (XeO\u2083)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-5-sonnet-v1:0",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon dioxide (CO2)",
            "Nitric oxide (NO)",
            "Nitrogen dioxide (NO2)",
            "Nitrous oxide (N2O)",
            "Sulfur dioxide (SO2)",
            "Ozone (O3)",
            "Carbon monoxide (CO)",
            "Hydrogen peroxide (H2O2)",
            "Nitrogen trioxide (N2O3)",
            "Dinitrogen tetroxide (N2O4)",
            "Nitrogen pentoxide (N2O5)",
            "Sulfur trioxide (SO3)",
            "Chlorine dioxide (ClO2)",
            "Nitric acid vapor (HNO3)",
            "Hydrogen sulfide (H2S)",
            "Carbonyl sulfide (COS)",
            "Sulfur hexafluoride (SF6)",
            "Xenon trioxide (XeO3)",
            "Xenon tetroxide (XeO4)",
            "Dinitrogen pentoxide (N2O5)",
            "Dichlorine monoxide (Cl2O)",
            "Chlorine monoxide (ClO)",
            "Bromine monoxide (BrO)",
            "Iodine monoxide (IO)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon dioxide (CO2)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrous oxide (N2O)",
                "Sulfur dioxide (SO2)",
                "Ozone (O3)",
                "Carbon monoxide (CO)",
                "Hydrogen peroxide (H2O2)",
                "Nitrogen trioxide (N2O3)",
                "Dinitrogen tetroxide (N2O4)",
                "Nitrogen pentoxide (N2O5)",
                "Sulfur trioxide (SO3)",
                "Chlorine dioxide (ClO2)",
                "Nitric acid vapor (HNO3)",
                "Hydrogen sulfide (H2S)",
                "Carbonyl sulfide (COS)",
                "Sulfur hexafluoride (SF6)",
                "Xenon trioxide (XeO3)",
                "Xenon tetroxide (XeO4)",
                "Dinitrogen pentoxide (N2O5)",
                "Dichlorine monoxide (Cl2O)",
                "Chlorine monoxide (ClO)",
                "Bromine monoxide (BrO)",
                "Iodine monoxide (IO)"
            ],
            "mismatches": [],
            "true_referents": [
                "Bromine monoxide (BrO)",
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Carbonyl sulfide (COS)",
                "Chlorine dioxide (ClO2)",
                "Chlorine monoxide (ClO)",
                "Dichlorine monoxide (Cl2O)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen tetroxide (N2O4)",
                "Hydrogen peroxide (H2O2)",
                "Hydrogen sulfide (H2S)",
                "Iodine monoxide (IO)",
                "Nitric acid vapor (HNO3)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrogen pentoxide (N2O5)",
                "Nitrogen trioxide (N2O3)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Sulfur dioxide (SO2)",
                "Sulfur hexafluoride (SF6)",
                "Sulfur trioxide (SO3)",
                "Xenon tetroxide (XeO4)",
                "Xenon trioxide (XeO3)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-opus-v1:0",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon monoxide (CO)",
            "Carbon dioxide (CO2)",
            "Nitric oxide (NO)",
            "Nitrogen dioxide (NO2)",
            "Nitrous oxide (N2O)",
            "Sulfur dioxide (SO2)",
            "Sulfur trioxide (SO3)",
            "Hydrogen sulfide (H2S)",
            "Ammonia (NH3)",
            "Water vapor (H2O)",
            "Ozone (O3)",
            "Chlorine monoxide (ClO)",
            "Chlorine dioxide (ClO2)",
            "Bromine monoxide (BrO)",
            "Iodine monoxide (IO)",
            "Dinitrogen trioxide (N2O3)",
            "Dinitrogen tetroxide (N2O4)",
            "Dinitrogen pentoxide (N2O5)",
            "Phosphorus trioxide (P4O6)",
            "Phosphorus pentoxide (P4O10)",
            "Arsenic trioxide (As4O6)",
            "Selenium dioxide (SeO2)",
            "Tellurium dioxide (TeO2)",
            "Xenon trioxide (XeO3)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon monoxide (CO)",
                "Carbon dioxide (CO2)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrous oxide (N2O)",
                "Sulfur dioxide (SO2)",
                "Sulfur trioxide (SO3)",
                "Hydrogen sulfide (H2S)",
                "Ammonia (NH3)",
                "Water vapor (H2O)",
                "Ozone (O3)",
                "Chlorine monoxide (ClO)",
                "Chlorine dioxide (ClO2)",
                "Bromine monoxide (BrO)",
                "Iodine monoxide (IO)",
                "Dinitrogen trioxide (N2O3)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen pentoxide (N2O5)",
                "Phosphorus trioxide (P4O6)",
                "Phosphorus pentoxide (P4O10)",
                "Arsenic trioxide (As4O6)",
                "Selenium dioxide (SeO2)",
                "Tellurium dioxide (TeO2)",
                "Xenon trioxide (XeO3)"
            ],
            "mismatches": [],
            "true_referents": [
                "Ammonia (NH3)",
                "Arsenic trioxide (As4O6)",
                "Bromine monoxide (BrO)",
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Chlorine dioxide (ClO2)",
                "Chlorine monoxide (ClO)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen trioxide (N2O3)",
                "Hydrogen sulfide (H2S)",
                "Iodine monoxide (IO)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P4O10)",
                "Phosphorus trioxide (P4O6)",
                "Selenium dioxide (SeO2)",
                "Sulfur dioxide (SO2)",
                "Sulfur trioxide (SO3)",
                "Tellurium dioxide (TeO2)",
                "Water vapor (H2O)",
                "Xenon trioxide (XeO3)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-sonnet-v1:0",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon dioxide (CO2)",
            "Nitrous oxide (N2O)",
            "Sulfur dioxide (SO2)",
            "Ozone (O3)",
            "Nitrogen oxide (NO)",
            "Nitrogen dioxide (NO2)",
            "Dinitrogen pentoxide (N2O5)",
            "Nitric oxide (NO)",
            "Carbon monoxide (CO)",
            "Sulfur trioxide (SO3)",
            "Dinitrogen trioxide (N2O3)",
            "Nitrogen trioxide (N2O3)",
            "Dinitrogen tetroxide (N2O4)",
            "Nitrogen tetroxide (N2O4)",
            "Dioxygen (O2)",
            "Ozone (O3)",
            "Sulfur hexafluoride (SF6)",
            "Xenon trioxide (XeO3)",
            "Osmium tetroxide (OsO4)",
            "Ruthenium tetroxide (RuO4)",
            "Iridium tetroxide (IrO4)",
            "Rhenium trioxide (Re2O7)",
            "Tungsten trioxide (WO3)",
            "Molybdenum trioxide (MoO3)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon dioxide (CO2)",
                "Nitrous oxide (N2O)",
                "Sulfur dioxide (SO2)",
                "Ozone (O3)",
                "Nitrogen oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Dinitrogen pentoxide (N2O5)",
                "Nitric oxide (NO)",
                "Carbon monoxide (CO)",
                "Sulfur trioxide (SO3)",
                "Dinitrogen trioxide (N2O3)",
                "Nitrogen trioxide (N2O3)",
                "Dinitrogen tetroxide (N2O4)",
                "Nitrogen tetroxide (N2O4)",
                "Dioxygen (O2)",
                "Sulfur hexafluoride (SF6)",
                "Xenon trioxide (XeO3)",
                "Osmium tetroxide (OsO4)",
                "Ruthenium tetroxide (RuO4)",
                "Iridium tetroxide (IrO4)",
                "Rhenium trioxide (Re2O7)",
                "Tungsten trioxide (WO3)",
                "Molybdenum trioxide (MoO3)"
            ],
            "mismatches": [],
            "true_referents": [
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen trioxide (N2O3)",
                "Dioxygen (O2)",
                "Iridium tetroxide (IrO4)",
                "Molybdenum trioxide (MoO3)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrogen oxide (NO)",
                "Nitrogen tetroxide (N2O4)",
                "Nitrogen trioxide (N2O3)",
                "Nitrous oxide (N2O)",
                "Osmium tetroxide (OsO4)",
                "Ozone (O3)",
                "Rhenium trioxide (Re2O7)",
                "Ruthenium tetroxide (RuO4)",
                "Sulfur dioxide (SO2)",
                "Sulfur hexafluoride (SF6)",
                "Sulfur trioxide (SO3)",
                "Tungsten trioxide (WO3)",
                "Xenon trioxide (XeO3)"
            ],
            "TP": 23,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-haiku-v1:0",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon dioxide (CO2)",
            "Nitric oxide (NO)",
            "Nitrogen dioxide (NO2)",
            "Sulfur dioxide (SO2)",
            "Ozone (O3)",
            "Dinitrogen monoxide (N2O)",
            "Nitrogen trioxide (N2O3)",
            "Nitrogen tetroxide (N2O4)",
            "Dinitrogen pentoxide (N2O5)",
            "Phosphorus trioxide (P2O3)",
            "Phosphorus pentoxide (P2O5)",
            "Arsenic trioxide (As2O3)",
            "Arsenic pentoxide (As2O5)",
            "Antimony trioxide (Sb2O3)",
            "Antimony pentoxide (Sb2O5)",
            "Bismuth trioxide (Bi2O3)",
            "Bismuth pentoxide (Bi2O5)",
            "Selenium dioxide (SeO2)",
            "Tellurium dioxide (TeO2)",
            "Polonium dioxide (PoO2)",
            "Radon oxide (RnO)",
            "Xenon trioxide (XeO3)",
            "Xenon tetroxide (XeO4)",
            "Krypton monoxide (KrO)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon dioxide (CO2)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Sulfur dioxide (SO2)",
                "Ozone (O3)",
                "Dinitrogen monoxide (N2O)",
                "Nitrogen trioxide (N2O3)",
                "Nitrogen tetroxide (N2O4)",
                "Dinitrogen pentoxide (N2O5)",
                "Phosphorus trioxide (P2O3)",
                "Phosphorus pentoxide (P2O5)",
                "Arsenic trioxide (As2O3)",
                "Arsenic pentoxide (As2O5)",
                "Antimony trioxide (Sb2O3)",
                "Antimony pentoxide (Sb2O5)",
                "Bismuth trioxide (Bi2O3)",
                "Bismuth pentoxide (Bi2O5)",
                "Selenium dioxide (SeO2)",
                "Tellurium dioxide (TeO2)",
                "Polonium dioxide (PoO2)",
                "Radon oxide (RnO)",
                "Xenon trioxide (XeO3)",
                "Xenon tetroxide (XeO4)",
                "Krypton monoxide (KrO)"
            ],
            "mismatches": [],
            "true_referents": [
                "Antimony pentoxide (Sb2O5)",
                "Antimony trioxide (Sb2O3)",
                "Arsenic pentoxide (As2O5)",
                "Arsenic trioxide (As2O3)",
                "Bismuth pentoxide (Bi2O5)",
                "Bismuth trioxide (Bi2O3)",
                "Carbon dioxide (CO2)",
                "Dinitrogen monoxide (N2O)",
                "Dinitrogen pentoxide (N2O5)",
                "Krypton monoxide (KrO)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrogen tetroxide (N2O4)",
                "Nitrogen trioxide (N2O3)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P2O5)",
                "Phosphorus trioxide (P2O3)",
                "Polonium dioxide (PoO2)",
                "Radon oxide (RnO)",
                "Selenium dioxide (SeO2)",
                "Sulfur dioxide (SO2)",
                "Tellurium dioxide (TeO2)",
                "Xenon tetroxide (XeO4)",
                "Xenon trioxide (XeO3)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "meta.llama3-8b-instruct-v1:0",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Carbon dioxide (CO2)",
            "Nitrous oxide (N2O)",
            "Ozone (O3)",
            "Sulfur dioxide (SO2)",
            "Nitrogen dioxide (NO2)",
            "Oxygen (O2)",
            "Carbon monoxide (CO)",
            "Hydrogen peroxide (H2O2)",
            "Nitric oxide (NO)",
            "Ammonia (NH3)",
            "Phosphorus pentoxide (P2O5)",
            "Silicon dioxide (SiO2)",
            "Titanium dioxide (TiO2)",
            "Chromium trioxide (Cr2O3)",
            "Molybdenum trioxide (MoO3)",
            "Tungsten trioxide (WO3)",
            "Manganese dioxide (MnO2)",
            "Copper oxide (CuO)",
            "Zinc oxide (ZnO)",
            "Cadmium oxide (CdO)",
            "Mercury oxide (HgO)",
            "Arsenic trioxide (As2O3)",
            "Antimony trioxide (Sb2O3)",
            "Bismuth oxide (Bi2O3)",
            "Lead dioxide (PbO2)",
            "Selenium dioxide (SeO2)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Carbon dioxide (CO2)",
                "Nitrous oxide (N2O)",
                "Ozone (O3)",
                "Sulfur dioxide (SO2)",
                "Nitrogen dioxide (NO2)",
                "Oxygen (O2)",
                "Carbon monoxide (CO)",
                "Hydrogen peroxide (H2O2)",
                "Nitric oxide (NO)",
                "Ammonia (NH3)",
                "Phosphorus pentoxide (P2O5)",
                "Silicon dioxide (SiO2)",
                "Titanium dioxide (TiO2)",
                "Chromium trioxide (Cr2O3)",
                "Molybdenum trioxide (MoO3)",
                "Tungsten trioxide (WO3)",
                "Manganese dioxide (MnO2)",
                "Copper oxide (CuO)",
                "Zinc oxide (ZnO)",
                "Cadmium oxide (CdO)",
                "Mercury oxide (HgO)",
                "Arsenic trioxide (As2O3)",
                "Antimony trioxide (Sb2O3)",
                "Bismuth oxide (Bi2O3)"
            ],
            "mismatches": [
                "Lead dioxide (PbO2)",
                "Selenium dioxide (SeO2)"
            ],
            "true_referents": [
                "Ammonia (NH3)",
                "Antimony trioxide (Sb2O3)",
                "Arsenic trioxide (As2O3)",
                "Bismuth oxide (Bi2O3)",
                "Cadmium oxide (CdO)",
                "Carbon dioxide (CO2)",
                "Carbon monoxide (CO)",
                "Chromium trioxide (Cr2O3)",
                "Copper oxide (CuO)",
                "Hydrogen peroxide (H2O2)",
                "Manganese dioxide (MnO2)",
                "Mercury oxide (HgO)",
                "Molybdenum trioxide (MoO3)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrous oxide (N2O)",
                "Oxygen (O2)",
                "Ozone (O3)",
                "Phosphorus pentoxide (P2O5)",
                "Silicon dioxide (SiO2)",
                "Sulfur dioxide (SO2)",
                "Titanium dioxide (TiO2)",
                "Tungsten trioxide (WO3)",
                "Zinc oxide (ZnO)"
            ],
            "TP": 24,
            "FP": 2,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "meta.llama3-70b-instruct-v1:0",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            "Nitric oxide (NO)",
            "Nitrous oxide (N2O)",
            "Carbon dioxide (CO2)",
            "Sulfur dioxide (SO2)",
            "Nitrogen dioxide (NO2)",
            "Ozone (O3)",
            "Sulfur trioxide (SO3)",
            "Xenon oxide (XeO3)",
            "Dinitrogen monoxide (N2O)",
            "Dinitrogen tetroxide (N2O4)",
            "Dinitrogen pentoxide (N2O5)",
            "Chlorine dioxide (ClO2)",
            "Bromine dioxide (BrO2)",
            "Iodine dioxide (IO2)",
            "Silicon dioxide (SiO2)",
            "Germanium dioxide (GeO2)",
            "Tin dioxide (SnO2)",
            "Lead dioxide (PbO2)",
            "Manganese dioxide (MnO2)",
            "Chromium dioxide (CrO2)",
            "Molybdenum dioxide (MoO2)",
            "Tungsten dioxide (WO2)",
            "Rhenium dioxide (ReO2)",
            "Osmium dioxide (OsO2)",
            "Iridium dioxide (IrO2)",
            "Platinum dioxide (PtO2)",
            "Gold dioxide (AuO2)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Nitric oxide (NO)",
                "Nitrous oxide (N2O)",
                "Carbon dioxide (CO2)",
                "Sulfur dioxide (SO2)",
                "Nitrogen dioxide (NO2)",
                "Ozone (O3)",
                "Sulfur trioxide (SO3)",
                "Xenon oxide (XeO3)",
                "Dinitrogen monoxide (N2O)",
                "Dinitrogen tetroxide (N2O4)",
                "Dinitrogen pentoxide (N2O5)",
                "Chlorine dioxide (ClO2)",
                "Bromine dioxide (BrO2)",
                "Iodine dioxide (IO2)",
                "Silicon dioxide (SiO2)",
                "Germanium dioxide (GeO2)",
                "Tin dioxide (SnO2)",
                "Lead dioxide (PbO2)",
                "Manganese dioxide (MnO2)",
                "Chromium dioxide (CrO2)",
                "Molybdenum dioxide (MoO2)",
                "Tungsten dioxide (WO2)",
                "Rhenium dioxide (ReO2)",
                "Osmium dioxide (OsO2)"
            ],
            "mismatches": [
                "Iridium dioxide (IrO2)",
                "Platinum dioxide (PtO2)",
                "Gold dioxide (AuO2)"
            ],
            "true_referents": [
                "Bromine dioxide (BrO2)",
                "Carbon dioxide (CO2)",
                "Chlorine dioxide (ClO2)",
                "Chromium dioxide (CrO2)",
                "Dinitrogen monoxide (N2O)",
                "Dinitrogen pentoxide (N2O5)",
                "Dinitrogen tetroxide (N2O4)",
                "Germanium dioxide (GeO2)",
                "Iodine dioxide (IO2)",
                "Lead dioxide (PbO2)",
                "Manganese dioxide (MnO2)",
                "Molybdenum dioxide (MoO2)",
                "Nitric oxide (NO)",
                "Nitrogen dioxide (NO2)",
                "Nitrous oxide (N2O)",
                "Osmium dioxide (OsO2)",
                "Ozone (O3)",
                "Rhenium dioxide (ReO2)",
                "Silicon dioxide (SiO2)",
                "Sulfur dioxide (SO2)",
                "Sulfur trioxide (SO3)",
                "Tin dioxide (SnO2)",
                "Tungsten dioxide (WO2)",
                "Xenon oxide (XeO3)"
            ],
            "TP": 24,
            "FP": 3,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "amazon.titan-text-express-v1",
        "concept": "gaseous oxide",
        "domain": "chemistry",
        "response": [
            {
                "Referent": "carbon dioxide",
                "Canonical Name": "CO2"
            },
            {
                "Referent": "carbon monoxide",
                "Canonical Name": "CO"
            },
            {
                "Referent": "nitrogen dioxide",
                "Canonical Name": "NO2"
            },
            {
                "Referent": "sulfur dioxide",
                "Canonical Name": "SO2"
            },
            {
                "Referent": "ammonia",
                "Canonical Name": "NH3"
            },
            {
                "Referent": "hydrogen sulfide",
                "Canonical Name": "H2S"
            },
            {
                "Referent": "nitrous oxide",
                "Canonical Name": "N2O"
            },
            {
                "Referent": "hydrogen peroxide",
                "Canonical Name": "H2O2"
            },
            {
                "Referent": "ozone",
                "Canonical Name": "O3"
            },
            {
                "Referent": "nitrogen trioxide",
                "Canonical Name": "NO3"
            },
            {
                "Referent": "nitrogen pentoxide",
                "Canonical Name": "NO5"
            },
            {
                "Referent": "sulfur trioxide",
                "Canonical Name": "SO3"
            },
            {
                "Referent": "phosphine",
                "Canonical Name": "PH3"
            },
            {
                "Referent": "dimethyl sulfide",
                "Canonical Name": "DMS"
            },
            {
                "Referent": "hydrogen cyanide",
                "Canonical Name": "HCN"
            },
            {
                "Referent": "hydrogen selenide",
                "Canonical Name": "H2Se"
            },
            {
                "Referent": "carbon disulfide",
                "Canonical Name": "CS2"
            },
            {
                "Referent": "dichlorodifluoromethane",
                "Canonical Name": "CHCl2F2"
            },
            {
                "Referent": "chlorine trifluoride",
                "Canonical Name": "ClF3"
            },
            {
                "Referent": "sulfuryl chloride",
                "Canonical Name": "SO2Cl2"
            },
            {
                "Referent": "dichlorodifluoromethane",
                "Canonical Name": "CHCl2F2"
            },
            {
                "Referent": "chlorine trifluoride",
                "Canonical Name": "ClF3"
            },
            {
                "Referent": "sulfuryl chloride",
                "Canonical Name": "SO2Cl2"
            },
            {
                "Referent": "dichlorodifluoromethane",
                "Canonical Name": "CHCl2F2"
            },
            {
                "Referent": "chlorine trifluoride",
                "Canonical Name": "ClF3"
            },
            {
                "Referent": "sulfuryl chloride",
                "Canonical Name": "SO2Cl2"
            },
            {
                "Referent": "dichlorodifluoromethane",
                "Canonical Name": "CHCl2F2"
            },
            {
                "Referent": "chlorine trifluoride",
                "Canonical Name": "ClF3"
            },
            {
                "Referent": "sulfuryl chloride",
                "Canonical Name": "SO2Cl2"
            },
            {
                "Referent": "dichlorodifluoromethane",
                "Canonical Name": "CHCl2F2"
            },
            {
                "Referent": "chlorine trifluoride",
                "Canonical Name": "ClF3"
            },
            {
                "Referent": "sulfuryl chloride",
                "Canonical Name": "SO2Cl2"
            }
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "carbon dioxide",
                "carbon monoxide",
                "nitrogen dioxide",
                "sulfur dioxide",
                "ammonia",
                "hydrogen sulfide",
                "nitrous oxide",
                "hydrogen peroxide",
                "ozone",
                "nitrogen trioxide",
                "nitrogen pentoxide",
                "sulfur trioxide",
                "phosphine",
                "dimethyl sulfide",
                "hydrogen cyanide",
                "hydrogen selenide",
                "carbon disulfide",
                "dichlorodifluoromethane",
                "chlorine trifluoride",
                "sulfuryl chloride"
            ],
            "mismatches": [],
            "true_referents": [
                "{\"Referent\": \"ammonia\", \"Canonical Name\": \"NH3\"}",
                "{\"Referent\": \"carbon dioxide\", \"Canonical Name\": \"CO2\"}",
                "{\"Referent\": \"carbon disulfide\", \"Canonical Name\": \"CS2\"}",
                "{\"Referent\": \"carbon monoxide\", \"Canonical Name\": \"CO\"}",
                "{\"Referent\": \"chlorine trifluoride\", \"Canonical Name\": \"ClF3\"}",
                "{\"Referent\": \"dichlorodifluoromethane\", \"Canonical Name\": \"CHCl2F2\"}",
                "{\"Referent\": \"dimethyl sulfide\", \"Canonical Name\": \"DMS\"}",
                "{\"Referent\": \"hydrogen cyanide\", \"Canonical Name\": \"HCN\"}",
                "{\"Referent\": \"hydrogen peroxide\", \"Canonical Name\": \"H2O2\"}",
                "{\"Referent\": \"hydrogen selenide\", \"Canonical Name\": \"H2Se\"}",
                "{\"Referent\": \"hydrogen sulfide\", \"Canonical Name\": \"H2S\"}",
                "{\"Referent\": \"nitrogen dioxide\", \"Canonical Name\": \"NO2\"}",
                "{\"Referent\": \"nitrogen pentoxide\", \"Canonical Name\": \"NO5\"}",
                "{\"Referent\": \"nitrogen trioxide\", \"Canonical Name\": \"NO3\"}",
                "{\"Referent\": \"nitrous oxide\", \"Canonical Name\": \"N2O\"}",
                "{\"Referent\": \"ozone\", \"Canonical Name\": \"O3\"}",
                "{\"Referent\": \"phosphine\", \"Canonical Name\": \"PH3\"}",
                "{\"Referent\": \"sulfur dioxide\", \"Canonical Name\": \"SO2\"}",
                "{\"Referent\": \"sulfur trioxide\", \"Canonical Name\": \"SO3\"}",
                "{\"Referent\": \"sulfuryl chloride\", \"Canonical Name\": \"SO2Cl2\"}"
            ],
            "TP": 20,
            "FP": 0,
            "FN": 0
        }
    }
]