[
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "qwen-v2.5-14b-instruct",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (THLD)",
            "Lindane (LND)",
            "Isotretinoin (ISRTN)",
            "Warfarin (WARF)",
            "Phenytoin (PHT)",
            "Diethylstilbestrol (DES)",
            "Valproic Acid (VPA)",
            "Ethanol (ETHL)",
            "Lidocaine (LDC)",
            "Metronidazole (MTNZ)",
            "Acetaminophen (APAP)",
            "Chloramphenicol (CHLPM)",
            "Isoniazid (ISON)",
            "Sulfonylureas (SULF)",
            "Mercury (Hg)",
            "Lithium (Li)",
            "Methimazole (MMZ)",
            "Propylthiouracil (PTU)",
            "Tetracycline (TETR)",
            "Doxorubicin (DOXR)",
            "Hydroxyurea (HYU)",
            "Vincristine (VCR)",
            "Tamoxifen (TAMO)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (THLD)",
                "Lindane (LND)",
                "Isotretinoin (ISRTN)",
                "Warfarin (WARF)",
                "Phenytoin (PHT)",
                "Diethylstilbestrol (DES)",
                "Valproic Acid (VPA)",
                "Ethanol (ETHL)",
                "Lidocaine (LDC)",
                "Metronidazole (MTNZ)",
                "Acetaminophen (APAP)",
                "Chloramphenicol (CHLPM)",
                "Isoniazid (ISON)",
                "Sulfonylureas (SULF)",
                "Mercury (Hg)",
                "Lithium (Li)",
                "Methimazole (MMZ)",
                "Propylthiouracil (PTU)",
                "Tetracycline (TETR)",
                "Doxorubicin (DOXR)",
                "Hydroxyurea (HYU)",
                "Vincristine (VCR)",
                "Tamoxifen (TAMO)"
            ],
            "mismatches": [],
            "true_referents": [
                "Acetaminophen (APAP)",
                "Chloramphenicol (CHLPM)",
                "Diethylstilbestrol (DES)",
                "Doxorubicin (DOXR)",
                "Ethanol (ETHL)",
                "Hydroxyurea (HYU)",
                "Isoniazid (ISON)",
                "Isotretinoin (ISRTN)",
                "Lidocaine (LDC)",
                "Lindane (LND)",
                "Lithium (Li)",
                "Mercury (Hg)",
                "Methimazole (MMZ)",
                "Metronidazole (MTNZ)",
                "Phenytoin (PHT)",
                "Propylthiouracil (PTU)",
                "Sulfonylureas (SULF)",
                "Tamoxifen (TAMO)",
                "Tetracycline (TETR)",
                "Thalidomide (THLD)",
                "Valproic Acid (VPA)",
                "Vincristine (VCR)",
                "Warfarin (WARF)"
            ],
            "TP": 23,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "phi-v4",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (C13H10N2O4)",
            "Isotretinoin (C20H28O2)",
            "Valproic acid (C8H16O2)",
            "Warfarin (C19H16O4)",
            "Methotrexate (C20H22N8O5)",
            "Alcohol (ethanol, C2H5OH)",
            "Cocaine (C17H21NO4)",
            "Heroin (C21H23NO5)",
            "Phenytoin (C15H12N2O2)",
            "Lithium (Li)",
            "Methimazole (C4H4N2S)",
            "Cytotoxic chemotherapy agents",
            "Retinoic acid (tretinoin, C20H28O2)",
            "Vitamin A (retinol, C20H30O)",
            "Misoprostol (C22H38O5)",
            "Tetracycline (C22H24N2O8)",
            "Cocaine analogs",
            "Androgenic steroids",
            "Amphetamines (C9H13N)",
            "Benzodiazepines (various structures)",
            "Phencyclidine (PCP, C17H25N)",
            "Cannabinoids (various structures)",
            "Anticonvulsants (various structures)",
            "ACE inhibitors (various structures)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (C13H10N2O4)",
                "Isotretinoin (C20H28O2)",
                "Valproic acid (C8H16O2)",
                "Warfarin (C19H16O4)",
                "Methotrexate (C20H22N8O5)",
                "Alcohol (ethanol, C2H5OH)",
                "Cocaine (C17H21NO4)",
                "Heroin (C21H23NO5)",
                "Phenytoin (C15H12N2O2)",
                "Lithium (Li)",
                "Methimazole (C4H4N2S)",
                "Cytotoxic chemotherapy agents",
                "Retinoic acid (tretinoin, C20H28O2)",
                "Vitamin A (retinol, C20H30O)",
                "Misoprostol (C22H38O5)",
                "Tetracycline (C22H24N2O8)",
                "Cocaine analogs",
                "Androgenic steroids",
                "Amphetamines (C9H13N)",
                "Benzodiazepines (various structures)",
                "Phencyclidine (PCP, C17H25N)",
                "Cannabinoids (various structures)",
                "Anticonvulsants (various structures)",
                "ACE inhibitors (various structures)"
            ],
            "mismatches": [],
            "true_referents": [
                "ACE inhibitors (various structures)",
                "Alcohol (ethanol, C2H5OH)",
                "Amphetamines (C9H13N)",
                "Androgenic steroids",
                "Anticonvulsants (various structures)",
                "Benzodiazepines (various structures)",
                "Cannabinoids (various structures)",
                "Cocaine (C17H21NO4)",
                "Cocaine analogs",
                "Cytotoxic chemotherapy agents",
                "Heroin (C21H23NO5)",
                "Isotretinoin (C20H28O2)",
                "Lithium (Li)",
                "Methimazole (C4H4N2S)",
                "Methotrexate (C20H22N8O5)",
                "Misoprostol (C22H38O5)",
                "Phencyclidine (PCP, C17H25N)",
                "Phenytoin (C15H12N2O2)",
                "Retinoic acid (tretinoin, C20H28O2)",
                "Tetracycline (C22H24N2O8)",
                "Thalidomide (C13H10N2O4)",
                "Valproic acid (C8H16O2)",
                "Vitamin A (retinol, C20H30O)",
                "Warfarin (C19H16O4)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "mistral-small-instruct-24B",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (Thalidomide)",
            "Valproic Acid (Valproic Acid)",
            "Accutane (Isotretinoin)",
            "Methylmercury",
            "Warfarin",
            "Diethylstilbestrol (DES)",
            "Cyclophosphamide",
            "Methotrexate",
            "Phenytoin",
            "Carbamazepine",
            "Ethanol",
            "Retinoic Acid",
            "Benzodiazepines",
            "Lithium",
            "Amiodarone",
            "Phenobarbital",
            "Trimethadione",
            "Phenytoin",
            "Primidone",
            "Lamotrigine",
            "Topiramate",
            "Oxycodone",
            "Fluconazole",
            "Ibuprofen"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (Thalidomide)",
                "Valproic Acid (Valproic Acid)",
                "Accutane (Isotretinoin)",
                "Methylmercury",
                "Warfarin",
                "Diethylstilbestrol (DES)",
                "Cyclophosphamide",
                "Methotrexate",
                "Phenytoin",
                "Carbamazepine",
                "Ethanol",
                "Retinoic Acid",
                "Benzodiazepines",
                "Lithium",
                "Amiodarone",
                "Phenobarbital",
                "Trimethadione",
                "Primidone",
                "Lamotrigine",
                "Topiramate",
                "Oxycodone",
                "Fluconazole",
                "Ibuprofen"
            ],
            "mismatches": [],
            "true_referents": [
                "Accutane (Isotretinoin)",
                "Amiodarone",
                "Benzodiazepines",
                "Carbamazepine",
                "Cyclophosphamide",
                "Diethylstilbestrol (DES)",
                "Ethanol",
                "Fluconazole",
                "Ibuprofen",
                "Lamotrigine",
                "Lithium",
                "Methotrexate",
                "Methylmercury",
                "Oxycodone",
                "Phenobarbital",
                "Phenytoin",
                "Primidone",
                "Retinoic Acid",
                "Thalidomide (Thalidomide)",
                "Topiramate",
                "Trimethadione",
                "Valproic Acid (Valproic Acid)",
                "Warfarin"
            ],
            "TP": 23,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "gemma-3",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (THAL)",
            "Diethylstilbestrol (DES)",
            "Methylmercury (MeHg)",
            "Valproic Acid (VPA)",
            "Retinoic Acid (RA)",
            "Isotretinoin (ISO)",
            "Warfarin (WARF)",
            "Phenytoin (PHYT)",
            "Lithium Carbonate (Li2CO3)",
            "Tetracycline (TET)",
            "ACE Inhibitors (ACEI)",
            "Angiotensin II Receptor Blockers (ARBs)",
            "Methotrexate (MTX)",
            "Misoprostol (MISO)",
            "Ribavirin (RIB)",
            "Ethanol (EtOH)",
            "Phenobarbital (PB)",
            "Carbamazepine (CBZ)",
            "Topiramate (TOP)",
            "Trimethadione (TRI)",
            "Paracetamol (PAR)",
            "Ibuprofen (IBU)",
            "Naproxen (NAP)",
            "Pesticides (PEST)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (THAL)",
                "Diethylstilbestrol (DES)",
                "Methylmercury (MeHg)",
                "Valproic Acid (VPA)",
                "Retinoic Acid (RA)",
                "Isotretinoin (ISO)",
                "Warfarin (WARF)",
                "Phenytoin (PHYT)",
                "Lithium Carbonate (Li2CO3)",
                "Tetracycline (TET)",
                "ACE Inhibitors (ACEI)",
                "Angiotensin II Receptor Blockers (ARBs)",
                "Methotrexate (MTX)",
                "Misoprostol (MISO)",
                "Ribavirin (RIB)",
                "Ethanol (EtOH)",
                "Phenobarbital (PB)",
                "Carbamazepine (CBZ)",
                "Topiramate (TOP)",
                "Trimethadione (TRI)",
                "Paracetamol (PAR)",
                "Ibuprofen (IBU)",
                "Naproxen (NAP)",
                "Pesticides (PEST)"
            ],
            "mismatches": [],
            "true_referents": [
                "ACE Inhibitors (ACEI)",
                "Angiotensin II Receptor Blockers (ARBs)",
                "Carbamazepine (CBZ)",
                "Diethylstilbestrol (DES)",
                "Ethanol (EtOH)",
                "Ibuprofen (IBU)",
                "Isotretinoin (ISO)",
                "Lithium Carbonate (Li2CO3)",
                "Methotrexate (MTX)",
                "Methylmercury (MeHg)",
                "Misoprostol (MISO)",
                "Naproxen (NAP)",
                "Paracetamol (PAR)",
                "Pesticides (PEST)",
                "Phenobarbital (PB)",
                "Phenytoin (PHYT)",
                "Retinoic Acid (RA)",
                "Ribavirin (RIB)",
                "Tetracycline (TET)",
                "Thalidomide (THAL)",
                "Topiramate (TOP)",
                "Trimethadione (TRI)",
                "Valproic Acid (VPA)",
                "Warfarin (WARF)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "gpt-4o",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (C13H10N2O4)",
            "Isotretinoin (C20H28O2)",
            "Diethylstilbestrol (DES)",
            "Valproic Acid (C8H16O2)",
            "Warfarin (C19H16O4)",
            "Methotrexate (C20H22N8O5)",
            "Cyclophosphamide (C7H15Cl2N2O2P)",
            "Carbamazepine (C15H12N2O)",
            "Phenytoin (C15H12N2O2)",
            "Lithium Carbonate (Li2CO3)",
            "Misoprostol (C22H38O5)",
            "Tetracycline (C22H24N2O8)",
            "Aminopterin (C19H20N8O5)",
            "Mycophenolate Mofetil (C23H31NO7)",
            "Leflunomide (C12H9F3N2O2)",
            "Bosentan (C27H29N5O6S)",
            "Fluconazole (C13H12F2N6O)",
            "Danazol (C22H27NO2)",
            "Methimazole (C4H6N2S)",
            "Penicillamine (C5H11NO2S)",
            "Ribavirin (C8H12N4O5)",
            "Topiramate (C12H21NO8S)",
            "Trimethadione (C6H9NO3)",
            "Ergotamine (C33H35N5O5)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (C13H10N2O4)",
                "Isotretinoin (C20H28O2)",
                "Diethylstilbestrol (DES)",
                "Valproic Acid (C8H16O2)",
                "Warfarin (C19H16O4)",
                "Methotrexate (C20H22N8O5)",
                "Cyclophosphamide (C7H15Cl2N2O2P)",
                "Carbamazepine (C15H12N2O)",
                "Phenytoin (C15H12N2O2)",
                "Lithium Carbonate (Li2CO3)",
                "Misoprostol (C22H38O5)",
                "Tetracycline (C22H24N2O8)",
                "Aminopterin (C19H20N8O5)",
                "Mycophenolate Mofetil (C23H31NO7)",
                "Leflunomide (C12H9F3N2O2)",
                "Bosentan (C27H29N5O6S)",
                "Fluconazole (C13H12F2N6O)",
                "Danazol (C22H27NO2)",
                "Methimazole (C4H6N2S)",
                "Penicillamine (C5H11NO2S)",
                "Ribavirin (C8H12N4O5)",
                "Topiramate (C12H21NO8S)",
                "Trimethadione (C6H9NO3)",
                "Ergotamine (C33H35N5O5)"
            ],
            "mismatches": [],
            "true_referents": [
                "Aminopterin (C19H20N8O5)",
                "Bosentan (C27H29N5O6S)",
                "Carbamazepine (C15H12N2O)",
                "Cyclophosphamide (C7H15Cl2N2O2P)",
                "Danazol (C22H27NO2)",
                "Diethylstilbestrol (DES)",
                "Ergotamine (C33H35N5O5)",
                "Fluconazole (C13H12F2N6O)",
                "Isotretinoin (C20H28O2)",
                "Leflunomide (C12H9F3N2O2)",
                "Lithium Carbonate (Li2CO3)",
                "Methimazole (C4H6N2S)",
                "Methotrexate (C20H22N8O5)",
                "Misoprostol (C22H38O5)",
                "Mycophenolate Mofetil (C23H31NO7)",
                "Penicillamine (C5H11NO2S)",
                "Phenytoin (C15H12N2O2)",
                "Ribavirin (C8H12N4O5)",
                "Tetracycline (C22H24N2O8)",
                "Thalidomide (C13H10N2O4)",
                "Topiramate (C12H21NO8S)",
                "Trimethadione (C6H9NO3)",
                "Valproic Acid (C8H16O2)",
                "Warfarin (C19H16O4)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "gpt-4o-mini",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (THAL)",
            "Isotretinoin (ISOT)",
            "Valproic acid (VALP)",
            "Methotrexate (METH)",
            "ACE inhibitors (ACEI)",
            "Tetracycline (TET)",
            "Diethylstilbestrol (DES)",
            "Lithium (LITH)",
            "Warfarin (WARF)",
            "Alcohol (ETOH)",
            "Cytomegalovirus (CMV)",
            "Rubella virus (RUB)",
            "Zika virus (ZIK)",
            "Sodium valproate (SODV)",
            "Carbamazepine (CARB)",
            "Cyclophosphamide (CYCLO)",
            "Phenobarbital (PHENO)",
            "Cocaine (COCA)",
            "Retinoic acid (RETAC)",
            "Doxycycline (DOXY)",
            "Aminopterin (AMIN)",
            "Trimethoprim (TRIM)",
            "Misoprostol (MISO)",
            "Fluconazole (FLUCO)",
            "Benzodiazepines (BENZ)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (THAL)",
                "Isotretinoin (ISOT)",
                "Valproic acid (VALP)",
                "Methotrexate (METH)",
                "ACE inhibitors (ACEI)",
                "Tetracycline (TET)",
                "Diethylstilbestrol (DES)",
                "Lithium (LITH)",
                "Warfarin (WARF)",
                "Alcohol (ETOH)",
                "Cytomegalovirus (CMV)",
                "Rubella virus (RUB)",
                "Zika virus (ZIK)",
                "Sodium valproate (SODV)",
                "Carbamazepine (CARB)",
                "Cyclophosphamide (CYCLO)",
                "Phenobarbital (PHENO)",
                "Cocaine (COCA)",
                "Retinoic acid (RETAC)",
                "Doxycycline (DOXY)",
                "Aminopterin (AMIN)",
                "Trimethoprim (TRIM)",
                "Misoprostol (MISO)",
                "Fluconazole (FLUCO)"
            ],
            "mismatches": [
                "Benzodiazepines (BENZ)"
            ],
            "true_referents": [
                "ACE inhibitors (ACEI)",
                "Alcohol (ETOH)",
                "Aminopterin (AMIN)",
                "Carbamazepine (CARB)",
                "Cocaine (COCA)",
                "Cyclophosphamide (CYCLO)",
                "Cytomegalovirus (CMV)",
                "Diethylstilbestrol (DES)",
                "Doxycycline (DOXY)",
                "Fluconazole (FLUCO)",
                "Isotretinoin (ISOT)",
                "Lithium (LITH)",
                "Methotrexate (METH)",
                "Misoprostol (MISO)",
                "Phenobarbital (PHENO)",
                "Retinoic acid (RETAC)",
                "Rubella virus (RUB)",
                "Sodium valproate (SODV)",
                "Tetracycline (TET)",
                "Thalidomide (THAL)",
                "Trimethoprim (TRIM)",
                "Valproic acid (VALP)",
                "Warfarin (WARF)",
                "Zika virus (ZIK)"
            ],
            "TP": 24,
            "FP": 1,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "gpt-4-32k",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (C13H10N2O4)",
            "Isotretinoin (C20H28O2)",
            "Warfarin (C19H16O4)",
            "Phenytoin (C15H12N2O2)",
            "Valproic Acid (C8H16O2)",
            "Methotrexate (C20H22N8O5)",
            "Tetracycline (C22H24N2O8)",
            "Streptomycin (C21H39N7O12)",
            "Chloramphenicol (C11H12Cl2N2O5)",
            "Griseofulvin (C17H17ClO6)",
            "Cyclophosphamide (C7H15Cl2N2O2P)",
            "Busulfan (C6H14O6S2)",
            "Ethanol (C2H6O)",
            "Nicotine (C10H14N2)",
            "Cocaine (C17H21NO4)",
            "Heroin (C21H23NO5)",
            "Methamphetamine (C10H15N)",
            "Lithium (Li)",
            "Lead (Pb)",
            "Mercury (Hg)",
            "Arsenic (As)",
            "Cadmium (Cd)",
            "Polychlorinated Biphenyls (PCBs)",
            "Dioxin (C12H4Cl4O)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (C13H10N2O4)",
                "Isotretinoin (C20H28O2)",
                "Warfarin (C19H16O4)",
                "Phenytoin (C15H12N2O2)",
                "Valproic Acid (C8H16O2)",
                "Methotrexate (C20H22N8O5)",
                "Tetracycline (C22H24N2O8)",
                "Streptomycin (C21H39N7O12)",
                "Chloramphenicol (C11H12Cl2N2O5)",
                "Griseofulvin (C17H17ClO6)",
                "Cyclophosphamide (C7H15Cl2N2O2P)",
                "Busulfan (C6H14O6S2)",
                "Ethanol (C2H6O)",
                "Nicotine (C10H14N2)",
                "Cocaine (C17H21NO4)",
                "Heroin (C21H23NO5)",
                "Methamphetamine (C10H15N)",
                "Lithium (Li)",
                "Lead (Pb)",
                "Mercury (Hg)",
                "Arsenic (As)",
                "Cadmium (Cd)",
                "Polychlorinated Biphenyls (PCBs)",
                "Dioxin (C12H4Cl4O)"
            ],
            "mismatches": [],
            "true_referents": [
                "Arsenic (As)",
                "Busulfan (C6H14O6S2)",
                "Cadmium (Cd)",
                "Chloramphenicol (C11H12Cl2N2O5)",
                "Cocaine (C17H21NO4)",
                "Cyclophosphamide (C7H15Cl2N2O2P)",
                "Dioxin (C12H4Cl4O)",
                "Ethanol (C2H6O)",
                "Griseofulvin (C17H17ClO6)",
                "Heroin (C21H23NO5)",
                "Isotretinoin (C20H28O2)",
                "Lead (Pb)",
                "Lithium (Li)",
                "Mercury (Hg)",
                "Methamphetamine (C10H15N)",
                "Methotrexate (C20H22N8O5)",
                "Nicotine (C10H14N2)",
                "Phenytoin (C15H12N2O2)",
                "Polychlorinated Biphenyls (PCBs)",
                "Streptomycin (C21H39N7O12)",
                "Tetracycline (C22H24N2O8)",
                "Thalidomide (C13H10N2O4)",
                "Valproic Acid (C8H16O2)",
                "Warfarin (C19H16O4)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "gpt-35-turbo-16k",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide",
            "Isotretinoin",
            "Valproic acid",
            "Methotrexate",
            "Warfarin",
            "Lithium",
            "Retinoids",
            "Tetracycline",
            "ACE inhibitors",
            "Misoprostol",
            "Methimazole",
            "Carbamazepine",
            "Phenytoin",
            "Trimethadione",
            "Androgens",
            "Diethylstilbestrol",
            "Cyclophosphamide",
            "Chloramphenicol",
            "Streptomycin",
            "Ethanol",
            "Cocaine",
            "Caffeine",
            "Lead",
            "Mercury"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide",
                "Isotretinoin",
                "Valproic acid",
                "Methotrexate",
                "Warfarin",
                "Lithium",
                "Retinoids",
                "Tetracycline",
                "ACE inhibitors",
                "Misoprostol",
                "Methimazole",
                "Carbamazepine",
                "Phenytoin",
                "Trimethadione",
                "Androgens",
                "Diethylstilbestrol",
                "Cyclophosphamide",
                "Chloramphenicol",
                "Streptomycin",
                "Ethanol",
                "Cocaine",
                "Caffeine",
                "Lead",
                "Mercury"
            ],
            "mismatches": [],
            "true_referents": [
                "ACE inhibitors",
                "Androgens",
                "Caffeine",
                "Carbamazepine",
                "Chloramphenicol",
                "Cocaine",
                "Cyclophosphamide",
                "Diethylstilbestrol",
                "Ethanol",
                "Isotretinoin",
                "Lead",
                "Lithium",
                "Mercury",
                "Methimazole",
                "Methotrexate",
                "Misoprostol",
                "Phenytoin",
                "Retinoids",
                "Streptomycin",
                "Tetracycline",
                "Thalidomide",
                "Trimethadione",
                "Valproic acid",
                "Warfarin"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "o1-mini",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (THD)",
            "Isotretinoin (13-cis-Retinoic acid)",
            "Ethanol (Alcohol)",
            "Valproic acid (VPA)",
            "Methotrexate (MTX)",
            "Diethylstilbestrol (DES)",
            "Retinoic acid",
            "Lead acetate",
            "Mercury chloride",
            "Arsenic trioxide",
            "Cocaine",
            "Phenytoin",
            "Warfarin",
            "Tetraethyllead",
            "Cyclophosphamide",
            "Benzo[a]pyrene",
            "Cyanide",
            "Ethylene glycol",
            "Lithium carbonate",
            "Thallium sulfate",
            "Trimethoprim",
            "Tretinoin (All-trans retinoic acid)",
            "Acrylamide",
            "Nitrofen"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (THD)",
                "Isotretinoin (13-cis-Retinoic acid)",
                "Ethanol (Alcohol)",
                "Valproic acid (VPA)",
                "Methotrexate (MTX)",
                "Diethylstilbestrol (DES)",
                "Retinoic acid",
                "Lead acetate",
                "Mercury chloride",
                "Arsenic trioxide",
                "Cocaine",
                "Phenytoin",
                "Warfarin",
                "Tetraethyllead",
                "Cyclophosphamide",
                "Benzo[a]pyrene",
                "Cyanide",
                "Ethylene glycol",
                "Lithium carbonate",
                "Thallium sulfate",
                "Trimethoprim",
                "Tretinoin (All-trans retinoic acid)",
                "Acrylamide",
                "Nitrofen"
            ],
            "mismatches": [],
            "true_referents": [
                "Acrylamide",
                "Arsenic trioxide",
                "Benzo[a]pyrene",
                "Cocaine",
                "Cyanide",
                "Cyclophosphamide",
                "Diethylstilbestrol (DES)",
                "Ethanol (Alcohol)",
                "Ethylene glycol",
                "Isotretinoin (13-cis-Retinoic acid)",
                "Lead acetate",
                "Lithium carbonate",
                "Mercury chloride",
                "Methotrexate (MTX)",
                "Nitrofen",
                "Phenytoin",
                "Retinoic acid",
                "Tetraethyllead",
                "Thalidomide (THD)",
                "Thallium sulfate",
                "Tretinoin (All-trans retinoic acid)",
                "Trimethoprim",
                "Valproic acid (VPA)",
                "Warfarin"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "anthropic.claude-3-5-sonnet-v1:0",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (THA)",
            "Isotretinoin (13-cis-retinoic acid)",
            "Valproic acid (VPA)",
            "Warfarin",
            "Ethanol",
            "Methylmercury",
            "Diethylstilbestrol (DES)",
            "Phenytoin",
            "Carbamazepine",
            "Lithium",
            "Misoprostol",
            "Methotrexate",
            "Tetracycline",
            "Angiotensin-converting enzyme (ACE) inhibitors",
            "Mycophenolate mofetil",
            "Fluconazole",
            "Cyclophosphamide",
            "Ribavirin",
            "Aminopterin",
            "Diethyltoluamide (DEET)",
            "Methimazole",
            "Danazol",
            "Cocaine",
            "Lead"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (THA)",
                "Isotretinoin (13-cis-retinoic acid)",
                "Valproic acid (VPA)",
                "Warfarin",
                "Ethanol",
                "Methylmercury",
                "Diethylstilbestrol (DES)",
                "Phenytoin",
                "Carbamazepine",
                "Lithium",
                "Misoprostol",
                "Methotrexate",
                "Tetracycline",
                "Angiotensin-converting enzyme (ACE) inhibitors",
                "Mycophenolate mofetil",
                "Fluconazole",
                "Cyclophosphamide",
                "Ribavirin",
                "Aminopterin",
                "Diethyltoluamide (DEET)",
                "Methimazole",
                "Danazol",
                "Cocaine",
                "Lead"
            ],
            "mismatches": [],
            "true_referents": [
                "Aminopterin",
                "Angiotensin-converting enzyme (ACE) inhibitors",
                "Carbamazepine",
                "Cocaine",
                "Cyclophosphamide",
                "Danazol",
                "Diethylstilbestrol (DES)",
                "Diethyltoluamide (DEET)",
                "Ethanol",
                "Fluconazole",
                "Isotretinoin (13-cis-retinoic acid)",
                "Lead",
                "Lithium",
                "Methimazole",
                "Methotrexate",
                "Methylmercury",
                "Misoprostol",
                "Mycophenolate mofetil",
                "Phenytoin",
                "Ribavirin",
                "Tetracycline",
                "Thalidomide (THA)",
                "Valproic acid (VPA)",
                "Warfarin"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "anthropic.claude-3-opus-v1:0",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide",
            "Valproic acid",
            "Isotretinoin (Accutane)",
            "Diethylstilbestrol (DES)",
            "Warfarin",
            "Lithium",
            "Phenytoin",
            "Trimethadione",
            "Aminopterin",
            "Methotrexate",
            "Fluconazole",
            "Tetracycline",
            "Streptomycin",
            "Kanamycin",
            "Chloroquine",
            "Quinine",
            "Misoprostol",
            "Mycophenolate mofetil",
            "Cyclophosphamide",
            "Methimazole",
            "Propylthiouracil",
            "Penicillamine",
            "Etretinate",
            "Triparanol"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide",
                "Valproic acid",
                "Isotretinoin (Accutane)",
                "Diethylstilbestrol (DES)",
                "Warfarin",
                "Lithium",
                "Phenytoin",
                "Trimethadione",
                "Aminopterin",
                "Methotrexate",
                "Fluconazole",
                "Tetracycline",
                "Streptomycin",
                "Kanamycin",
                "Chloroquine",
                "Quinine",
                "Misoprostol",
                "Mycophenolate mofetil",
                "Cyclophosphamide",
                "Methimazole",
                "Propylthiouracil",
                "Penicillamine",
                "Etretinate",
                "Triparanol"
            ],
            "mismatches": [],
            "true_referents": [
                "Aminopterin",
                "Chloroquine",
                "Cyclophosphamide",
                "Diethylstilbestrol (DES)",
                "Etretinate",
                "Fluconazole",
                "Isotretinoin (Accutane)",
                "Kanamycin",
                "Lithium",
                "Methimazole",
                "Methotrexate",
                "Misoprostol",
                "Mycophenolate mofetil",
                "Penicillamine",
                "Phenytoin",
                "Propylthiouracil",
                "Quinine",
                "Streptomycin",
                "Tetracycline",
                "Thalidomide",
                "Trimethadione",
                "Triparanol",
                "Valproic acid",
                "Warfarin"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "anthropic.claude-3-sonnet-v1:0",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (C13H10N2O4)",
            "Isotretinoin (C20H28O2)",
            "Valproic acid (C8H16O2)",
            "Methotrexate (C20H22N8O5)",
            "Alcohol (C2H6O)",
            "Warfarin (C19H16O4)",
            "Lithium (Li)",
            "Angiotensin-converting enzyme (ACE) inhibitors",
            "Tetracycline antibiotics",
            "Diethylstilbestrol (C18H20O2)",
            "Carbamazepine (C15H12N2O)",
            "Phenytoin (C15H12N2O2)",
            "Leflunomide (C12H9F3N2O2)",
            "Misoprostol (C22H38O5)",
            "Paroxetine (C19H20FNO3)",
            "Statins",
            "Streptomycin",
            "Aminoglycosides",
            "Coumarin anticoagulants",
            "Retinoids",
            "Antiepileptic drugs",
            "Antidepressants",
            "Angiotensin II receptor blockers (ARBs)",
            "Anticonvulsants"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (C13H10N2O4)",
                "Isotretinoin (C20H28O2)",
                "Valproic acid (C8H16O2)",
                "Methotrexate (C20H22N8O5)",
                "Alcohol (C2H6O)",
                "Warfarin (C19H16O4)",
                "Lithium (Li)",
                "Angiotensin-converting enzyme (ACE) inhibitors",
                "Tetracycline antibiotics",
                "Diethylstilbestrol (C18H20O2)",
                "Carbamazepine (C15H12N2O)",
                "Phenytoin (C15H12N2O2)",
                "Leflunomide (C12H9F3N2O2)",
                "Misoprostol (C22H38O5)",
                "Paroxetine (C19H20FNO3)",
                "Statins",
                "Streptomycin",
                "Aminoglycosides",
                "Coumarin anticoagulants",
                "Retinoids",
                "Antiepileptic drugs",
                "Antidepressants",
                "Angiotensin II receptor blockers (ARBs)",
                "Anticonvulsants"
            ],
            "mismatches": [],
            "true_referents": [
                "Alcohol (C2H6O)",
                "Aminoglycosides",
                "Angiotensin II receptor blockers (ARBs)",
                "Angiotensin-converting enzyme (ACE) inhibitors",
                "Anticonvulsants",
                "Antidepressants",
                "Antiepileptic drugs",
                "Carbamazepine (C15H12N2O)",
                "Coumarin anticoagulants",
                "Diethylstilbestrol (C18H20O2)",
                "Isotretinoin (C20H28O2)",
                "Leflunomide (C12H9F3N2O2)",
                "Lithium (Li)",
                "Methotrexate (C20H22N8O5)",
                "Misoprostol (C22H38O5)",
                "Paroxetine (C19H20FNO3)",
                "Phenytoin (C15H12N2O2)",
                "Retinoids",
                "Statins",
                "Streptomycin",
                "Tetracycline antibiotics",
                "Thalidomide (C13H10N2O4)",
                "Valproic acid (C8H16O2)",
                "Warfarin (C19H16O4)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "anthropic.claude-3-haiku-v1:0",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (THAL)",
            "Valproic Acid (VPA)",
            "Isotretinoin (ITRE)",
            "Methotrexate (MTX)",
            "Warfarin (WARF)",
            "Misoprostol (MISO)",
            "Diethylstilbestrol (DES)",
            "Phenytoin (PHT)",
            "Carbamazepine (CBZ)",
            "Paroxetine (PAROX)",
            "Fluoxetine (FLUOX)",
            "Sertraline (SERT)",
            "Venlafaxine (VENL)",
            "Lithium (LI)",
            "Sodium Valproate (VPA-Na)",
            "Lamotrigine (LTG)",
            "Topiramate (TPM)",
            "Levetiracetam (LEV)",
            "Zonisamide (ZNS)",
            "Gabapentin (GBP)",
            "Pregabalin (PGB)",
            "Divalproex Sodium (DVLX)",
            "Oxcarbazepine (OXC)",
            "Bupropion (BUP)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (THAL)",
                "Valproic Acid (VPA)",
                "Isotretinoin (ITRE)",
                "Methotrexate (MTX)",
                "Warfarin (WARF)",
                "Misoprostol (MISO)",
                "Diethylstilbestrol (DES)",
                "Phenytoin (PHT)",
                "Carbamazepine (CBZ)",
                "Paroxetine (PAROX)",
                "Fluoxetine (FLUOX)",
                "Sertraline (SERT)",
                "Venlafaxine (VENL)",
                "Lithium (LI)",
                "Sodium Valproate (VPA-Na)",
                "Lamotrigine (LTG)",
                "Topiramate (TPM)",
                "Levetiracetam (LEV)",
                "Zonisamide (ZNS)",
                "Gabapentin (GBP)",
                "Pregabalin (PGB)",
                "Divalproex Sodium (DVLX)",
                "Oxcarbazepine (OXC)",
                "Bupropion (BUP)"
            ],
            "mismatches": [],
            "true_referents": [
                "Bupropion (BUP)",
                "Carbamazepine (CBZ)",
                "Diethylstilbestrol (DES)",
                "Divalproex Sodium (DVLX)",
                "Fluoxetine (FLUOX)",
                "Gabapentin (GBP)",
                "Isotretinoin (ITRE)",
                "Lamotrigine (LTG)",
                "Levetiracetam (LEV)",
                "Lithium (LI)",
                "Methotrexate (MTX)",
                "Misoprostol (MISO)",
                "Oxcarbazepine (OXC)",
                "Paroxetine (PAROX)",
                "Phenytoin (PHT)",
                "Pregabalin (PGB)",
                "Sertraline (SERT)",
                "Sodium Valproate (VPA-Na)",
                "Thalidomide (THAL)",
                "Topiramate (TPM)",
                "Valproic Acid (VPA)",
                "Venlafaxine (VENL)",
                "Warfarin (WARF)",
                "Zonisamide (ZNS)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "meta.llama3-8b-instruct-v1:0",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (Thalidomide)",
            "Valproic acid (Valproate)",
            "Warfarin (Warfarin)",
            "Isotretinoin (Accutane)",
            "Methotrexate (Methotrexate)",
            "Aminopterin (Aminopterin)",
            "Folic acid antagonist (Folic acid antagonist)",
            "Methimazole (Tapazole)",
            "Kava (Kava)",
            "Valproate (Valproate)",
            "Fosphenytoin (Cerebyx)",
            "Phenytoin (Dilantin)",
            "Carbamazepine (Tegretol)",
            "Lamotrigine (Lamictal)",
            "Topiramate (Topamax)",
            "Phenobarbital (Luminal)",
            "Ethosuximide (Zarontin)",
            "Clonazepam (Klonopin)",
            "Gabapentin (Neurontin)",
            "Pregabalin (Lyrica)",
            "Valproate (Depakote)",
            "Ethanol (Ethanol)",
            "Tetracycline (Tetracycline)",
            "Tetracycline antibiotics (Tetracycline antibiotics)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (Thalidomide)",
                "Valproic acid (Valproate)",
                "Warfarin (Warfarin)",
                "Isotretinoin (Accutane)",
                "Methotrexate (Methotrexate)",
                "Aminopterin (Aminopterin)",
                "Folic acid antagonist (Folic acid antagonist)",
                "Methimazole (Tapazole)",
                "Kava (Kava)",
                "Valproate (Valproate)",
                "Fosphenytoin (Cerebyx)",
                "Phenytoin (Dilantin)",
                "Carbamazepine (Tegretol)",
                "Lamotrigine (Lamictal)",
                "Topiramate (Topamax)",
                "Phenobarbital (Luminal)",
                "Ethosuximide (Zarontin)",
                "Clonazepam (Klonopin)",
                "Gabapentin (Neurontin)",
                "Pregabalin (Lyrica)",
                "Valproate (Depakote)",
                "Ethanol (Ethanol)",
                "Tetracycline (Tetracycline)",
                "Tetracycline antibiotics (Tetracycline antibiotics)"
            ],
            "mismatches": [],
            "true_referents": [
                "Aminopterin (Aminopterin)",
                "Carbamazepine (Tegretol)",
                "Clonazepam (Klonopin)",
                "Ethanol (Ethanol)",
                "Ethosuximide (Zarontin)",
                "Folic acid antagonist (Folic acid antagonist)",
                "Fosphenytoin (Cerebyx)",
                "Gabapentin (Neurontin)",
                "Isotretinoin (Accutane)",
                "Kava (Kava)",
                "Lamotrigine (Lamictal)",
                "Methimazole (Tapazole)",
                "Methotrexate (Methotrexate)",
                "Phenobarbital (Luminal)",
                "Phenytoin (Dilantin)",
                "Pregabalin (Lyrica)",
                "Tetracycline (Tetracycline)",
                "Tetracycline antibiotics (Tetracycline antibiotics)",
                "Thalidomide (Thalidomide)",
                "Topiramate (Topamax)",
                "Valproate (Depakote)",
                "Valproate (Valproate)",
                "Valproic acid (Valproate)",
                "Warfarin (Warfarin)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "meta.llama3-70b-instruct-v1:0",
        "concept": "teratogen",
        "domain": "medicine",
        "response": [
            "Thalidomide (TD)",
            "Acrinathrin",
            "Cyclophosphamide (CP)",
            "Methotrexate (MTX)",
            "Isotretinoin (13-cis-RA)",
            "Etretinate (TIG)",
            "Valproic acid (VPA)",
            "Warfarin",
            "Phenytoin (PHT)",
            "Trimethadione (TMO)",
            "Doxycycline",
            "Tetracycline",
            "Fluconazole (FLU)",
            "Vitamin A (retinol)",
            "Diethylstilbestrol (DES)",
            "Cortisol",
            "Busulfan",
            "Chlorambucil",
            "Cyclosporine (CsA)",
            "Azathioprine",
            "Mercaptopurine (6-MP)",
            "Hydantoin"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Thalidomide (TD)",
                "Acrinathrin",
                "Cyclophosphamide (CP)",
                "Methotrexate (MTX)",
                "Isotretinoin (13-cis-RA)",
                "Etretinate (TIG)",
                "Valproic acid (VPA)",
                "Warfarin",
                "Phenytoin (PHT)",
                "Trimethadione (TMO)",
                "Doxycycline",
                "Tetracycline",
                "Fluconazole (FLU)",
                "Vitamin A (retinol)",
                "Diethylstilbestrol (DES)",
                "Cortisol",
                "Busulfan",
                "Chlorambucil",
                "Cyclosporine (CsA)",
                "Azathioprine",
                "Mercaptopurine (6-MP)",
                "Hydantoin"
            ],
            "mismatches": [],
            "true_referents": [
                "Acrinathrin",
                "Azathioprine",
                "Busulfan",
                "Chlorambucil",
                "Cortisol",
                "Cyclophosphamide (CP)",
                "Cyclosporine (CsA)",
                "Diethylstilbestrol (DES)",
                "Doxycycline",
                "Etretinate (TIG)",
                "Fluconazole (FLU)",
                "Hydantoin",
                "Isotretinoin (13-cis-RA)",
                "Mercaptopurine (6-MP)",
                "Methotrexate (MTX)",
                "Phenytoin (PHT)",
                "Tetracycline",
                "Thalidomide (TD)",
                "Trimethadione (TMO)",
                "Valproic acid (VPA)",
                "Vitamin A (retinol)",
                "Warfarin"
            ],
            "TP": 22,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents-from-selection-criteria",
        "responder": "amazon.titan-text-express-v1",
        "concept": "teratogen",
        "domain": "medicine",
        "response": {
            "1": "arsenic",
            "2": "mercury",
            "3": "lead",
            "4": "polychlorinated biphenyls"
        },
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "arsenic",
                "mercury",
                "lead",
                "polychlorinated biphenyls"
            ],
            "mismatches": [],
            "true_referents": [
                "1: arsenic",
                "2: mercury",
                "3: lead",
                "4: polychlorinated biphenyls"
            ],
            "TP": 4,
            "FP": 0,
            "FN": 0
        }
    }
]