[
    {
        "test": "limited-list-referents",
        "responder": "qwen-v2.5-14b-instruct",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Heroin (Diacetylmorphine)",
            "Morphine",
            "Codeine",
            "Fentanyl",
            "Hydrocodone",
            "Oxycodone",
            "Methadone",
            "Buprenorphine",
            "Meperidine (Demerol)",
            "Tramadol",
            "Pethidine (Fentalin)",
            "Alfentanil",
            "Sufentanil",
            "Remifentanil",
            "Tapentadol",
            "Propoxyphene",
            "Oxymorphone",
            "Hydromorphone (Dilaudid)",
            "Levorphanol",
            "Dextropropoxyphene",
            "Nalbuphine",
            "Pentazocine",
            "Butorphanol",
            "Diphenoxylate"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Heroin (Diacetylmorphine)",
                "Morphine",
                "Codeine",
                "Fentanyl",
                "Hydrocodone",
                "Oxycodone",
                "Methadone",
                "Buprenorphine",
                "Meperidine (Demerol)",
                "Tramadol",
                "Pethidine (Fentalin)",
                "Alfentanil",
                "Sufentanil",
                "Remifentanil",
                "Tapentadol",
                "Propoxyphene",
                "Oxymorphone",
                "Hydromorphone (Dilaudid)",
                "Levorphanol",
                "Dextropropoxyphene",
                "Nalbuphine",
                "Pentazocine",
                "Butorphanol",
                "Diphenoxylate"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil",
                "Buprenorphine",
                "Butorphanol",
                "Codeine",
                "Dextropropoxyphene",
                "Diphenoxylate",
                "Fentanyl",
                "Heroin (Diacetylmorphine)",
                "Hydrocodone",
                "Hydromorphone (Dilaudid)",
                "Levorphanol",
                "Meperidine (Demerol)",
                "Methadone",
                "Morphine",
                "Nalbuphine",
                "Oxycodone",
                "Oxymorphone",
                "Pentazocine",
                "Pethidine (Fentalin)",
                "Propoxyphene",
                "Remifentanil",
                "Sufentanil",
                "Tapentadol",
                "Tramadol"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "phi-v4",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (C17H19NO3)",
            "Codeine (C18H21NO3)",
            "Heroin (C21H23NO5)",
            "Fentanyl (C22H28N2O)",
            "Methadone (C21H27NO)",
            "Hydrocodone (C18H21NO3)",
            "Oxycodone (C18H21NO4)",
            "Buprenorphine (C29H41NO4)",
            "Pentazocine (C22H29NO4)",
            "Hydromorphone (C17H19NO3)",
            "Meperidine (C15H21NO2)",
            "Propoxyphene (C22H29NO2)",
            "Diamorphine (C17H19NO3)",
            "Alfentanil (C21H26N2O)",
            "Remifentanil (C25H33N3O2)",
            "Sufentanil (C22H31N3O2S)",
            "Tramadol (C16H25NO2)",
            "Tapentadol (C19H27NO2)",
            "Nalbuphine (C19H27NO4)",
            "Butorphanol (C18H27NO3)",
            "Naloxone (C19H21NO4)",
            "Naltrexone (C20H23NO4)",
            "Dextromethorphan (C18H25NO)",
            "Diazepam (C16H13ClN2O)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (C17H19NO3)",
                "Codeine (C18H21NO3)",
                "Heroin (C21H23NO5)",
                "Fentanyl (C22H28N2O)",
                "Methadone (C21H27NO)",
                "Hydrocodone (C18H21NO3)",
                "Oxycodone (C18H21NO4)",
                "Buprenorphine (C29H41NO4)",
                "Pentazocine (C22H29NO4)",
                "Hydromorphone (C17H19NO3)",
                "Meperidine (C15H21NO2)",
                "Propoxyphene (C22H29NO2)",
                "Diamorphine (C17H19NO3)",
                "Alfentanil (C21H26N2O)",
                "Remifentanil (C25H33N3O2)",
                "Sufentanil (C22H31N3O2S)",
                "Tramadol (C16H25NO2)",
                "Tapentadol (C19H27NO2)",
                "Nalbuphine (C19H27NO4)",
                "Butorphanol (C18H27NO3)",
                "Naloxone (C19H21NO4)",
                "Naltrexone (C20H23NO4)",
                "Dextromethorphan (C18H25NO)",
                "Diazepam (C16H13ClN2O)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil (C21H26N2O)",
                "Buprenorphine (C29H41NO4)",
                "Butorphanol (C18H27NO3)",
                "Codeine (C18H21NO3)",
                "Dextromethorphan (C18H25NO)",
                "Diamorphine (C17H19NO3)",
                "Diazepam (C16H13ClN2O)",
                "Fentanyl (C22H28N2O)",
                "Heroin (C21H23NO5)",
                "Hydrocodone (C18H21NO3)",
                "Hydromorphone (C17H19NO3)",
                "Meperidine (C15H21NO2)",
                "Methadone (C21H27NO)",
                "Morphine (C17H19NO3)",
                "Nalbuphine (C19H27NO4)",
                "Naloxone (C19H21NO4)",
                "Naltrexone (C20H23NO4)",
                "Oxycodone (C18H21NO4)",
                "Pentazocine (C22H29NO4)",
                "Propoxyphene (C22H29NO2)",
                "Remifentanil (C25H33N3O2)",
                "Sufentanil (C22H31N3O2S)",
                "Tapentadol (C19H27NO2)",
                "Tramadol (C16H25NO2)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "mistral-small-instruct-24B",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Heroin (HER)",
            "Codeine (COD)",
            "Oxycodone (OXY)",
            "Hydromorphone (HYD)",
            "Fentanyl (FEN)",
            "Methadone (MET)",
            "OxyContin (OC)",
            "Dilaudid (DL)",
            "Vicodin (VIC)",
            "Percocet (PCE)",
            "Demerol (DEM)",
            "Dolophine (DOL)",
            "Percodan (PCD)",
            "Lortab (LOR)",
            "Norco (NOR)",
            "Duragesic (DUR)",
            "Suboxone (SUB)",
            "Dolophine (DOP)",
            "Endocet (END)",
            "Lortab (LOR)",
            "Morphine Sulfate (MS)",
            "Methadose (METD)",
            "Levorphanol (LEV)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Heroin (HER)",
                "Codeine (COD)",
                "Oxycodone (OXY)",
                "Hydromorphone (HYD)",
                "Fentanyl (FEN)",
                "Methadone (MET)",
                "OxyContin (OC)",
                "Dilaudid (DL)",
                "Vicodin (VIC)",
                "Percocet (PCE)",
                "Demerol (DEM)",
                "Dolophine (DOL)",
                "Percodan (PCD)",
                "Lortab (LOR)",
                "Norco (NOR)",
                "Duragesic (DUR)",
                "Suboxone (SUB)",
                "Dolophine (DOP)",
                "Endocet (END)",
                "Morphine Sulfate (MS)",
                "Methadose (METD)",
                "Levorphanol (LEV)"
            ],
            "mismatches": [],
            "true_referents": [
                "Codeine (COD)",
                "Demerol (DEM)",
                "Dilaudid (DL)",
                "Dolophine (DOL)",
                "Dolophine (DOP)",
                "Duragesic (DUR)",
                "Endocet (END)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Hydromorphone (HYD)",
                "Levorphanol (LEV)",
                "Lortab (LOR)",
                "Methadone (MET)",
                "Methadose (METD)",
                "Morphine (MOR)",
                "Morphine Sulfate (MS)",
                "Norco (NOR)",
                "OxyContin (OC)",
                "Oxycodone (OXY)",
                "Percocet (PCE)",
                "Percodan (PCD)",
                "Suboxone (SUB)",
                "Vicodin (VIC)"
            ],
            "TP": 23,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gemma-3",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Codeine (COD)",
            "Oxycodone (OXY)",
            "Hydrocodone (HYD)",
            "Hydromorphone (HYM)",
            "Fentanyl (FEN)",
            "Methadone (MET)",
            "Tramadol (TRA)",
            "Heroin (HER)",
            "Dilaudid (DIL)",
            "Oxymorphone (OXM)",
            "Buprenorphine (BUP)",
            "Tapentadol (TAP)",
            "Pethidine/Meperidine (PET)",
            "Etorphine (ETO)",
            "Diphenoxylate (DPH)",
            "Loperamide (LOP)",
            "Pholcodine (PHO)",
            "Nalmefene (NAL)",
            "Naloxone (NAL)",
            "Pentazocine (PEN)",
            "Levorphanol (LEV)",
            "Desomorphine (DES)",
            "Vinblastine (VBL)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Codeine (COD)",
                "Oxycodone (OXY)",
                "Hydrocodone (HYD)",
                "Hydromorphone (HYM)",
                "Fentanyl (FEN)",
                "Methadone (MET)",
                "Tramadol (TRA)",
                "Heroin (HER)",
                "Dilaudid (DIL)",
                "Oxymorphone (OXM)",
                "Buprenorphine (BUP)",
                "Tapentadol (TAP)",
                "Pethidine/Meperidine (PET)",
                "Etorphine (ETO)",
                "Diphenoxylate (DPH)",
                "Loperamide (LOP)",
                "Pholcodine (PHO)",
                "Nalmefene (NAL)",
                "Naloxone (NAL)",
                "Pentazocine (PEN)",
                "Levorphanol (LEV)",
                "Desomorphine (DES)",
                "Vinblastine (VBL)"
            ],
            "mismatches": [],
            "true_referents": [
                "Buprenorphine (BUP)",
                "Codeine (COD)",
                "Desomorphine (DES)",
                "Dilaudid (DIL)",
                "Diphenoxylate (DPH)",
                "Etorphine (ETO)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Hydrocodone (HYD)",
                "Hydromorphone (HYM)",
                "Levorphanol (LEV)",
                "Loperamide (LOP)",
                "Methadone (MET)",
                "Morphine (MOR)",
                "Nalmefene (NAL)",
                "Naloxone (NAL)",
                "Oxycodone (OXY)",
                "Oxymorphone (OXM)",
                "Pentazocine (PEN)",
                "Pethidine/Meperidine (PET)",
                "Pholcodine (PHO)",
                "Tapentadol (TAP)",
                "Tramadol (TRA)",
                "Vinblastine (VBL)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4o",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Codeine",
            "Heroin (Diacetylmorphine)",
            "Fentanyl",
            "Oxycodone",
            "Hydrocodone",
            "Methadone",
            "Buprenorphine",
            "Hydromorphone",
            "Oxymorphone",
            "Meperidine",
            "Tramadol",
            "Tapentadol",
            "Pentazocine",
            "Butorphanol",
            "Nalbuphine",
            "Levorphanol",
            "Alfentanil",
            "Sufentanil",
            "Remifentanil",
            "Etorphine",
            "Carfentanil",
            "Propoxyphene",
            "Dextropropoxyphene"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Codeine",
                "Heroin (Diacetylmorphine)",
                "Fentanyl",
                "Oxycodone",
                "Hydrocodone",
                "Methadone",
                "Buprenorphine",
                "Hydromorphone",
                "Oxymorphone",
                "Meperidine",
                "Tramadol",
                "Tapentadol",
                "Pentazocine",
                "Butorphanol",
                "Nalbuphine",
                "Levorphanol",
                "Alfentanil",
                "Sufentanil",
                "Remifentanil",
                "Etorphine",
                "Carfentanil",
                "Propoxyphene",
                "Dextropropoxyphene"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil",
                "Buprenorphine",
                "Butorphanol",
                "Carfentanil",
                "Codeine",
                "Dextropropoxyphene",
                "Etorphine",
                "Fentanyl",
                "Heroin (Diacetylmorphine)",
                "Hydrocodone",
                "Hydromorphone",
                "Levorphanol",
                "Meperidine",
                "Methadone",
                "Morphine (MOR)",
                "Nalbuphine",
                "Oxycodone",
                "Oxymorphone",
                "Pentazocine",
                "Propoxyphene",
                "Remifentanil",
                "Sufentanil",
                "Tapentadol",
                "Tramadol"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4o-mini",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Codeine (COD)",
            "Heroin (Diacetylmorphine)",
            "Fentanyl (FEN)",
            "Oxycodone (OXY)",
            "Hydrocodone (HYC)",
            "Methadone (METH)",
            "Buprenorphine (BUP)",
            "Tramadol (TRA)",
            "Meperidine (MEP)",
            "Sufentanil (SUF)",
            "Alfentanil (ALF)",
            "Tapentadol (TAP)",
            "Pentazocine (PEN)",
            "Nalbuphine (NAL)",
            "Butorphanol (BUT)",
            "Dextropropoxyphene (DEX)",
            "Hydromorphone (HYD)",
            "Oxymorphone (OXM)",
            "Loperamide (LOP)",
            "Clonidine (CLON)",
            "Carfentanil (CAR)",
            "Kratom (Mitragyna speciosa)",
            "Dihydrocodeine (DHC)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Codeine (COD)",
                "Heroin (Diacetylmorphine)",
                "Fentanyl (FEN)",
                "Oxycodone (OXY)",
                "Hydrocodone (HYC)",
                "Methadone (METH)",
                "Buprenorphine (BUP)",
                "Tramadol (TRA)",
                "Meperidine (MEP)",
                "Sufentanil (SUF)",
                "Alfentanil (ALF)",
                "Tapentadol (TAP)",
                "Pentazocine (PEN)",
                "Nalbuphine (NAL)",
                "Butorphanol (BUT)",
                "Dextropropoxyphene (DEX)",
                "Hydromorphone (HYD)",
                "Oxymorphone (OXM)",
                "Loperamide (LOP)",
                "Clonidine (CLON)",
                "Carfentanil (CAR)",
                "Kratom (Mitragyna speciosa)",
                "Dihydrocodeine (DHC)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil (ALF)",
                "Buprenorphine (BUP)",
                "Butorphanol (BUT)",
                "Carfentanil (CAR)",
                "Clonidine (CLON)",
                "Codeine (COD)",
                "Dextropropoxyphene (DEX)",
                "Dihydrocodeine (DHC)",
                "Fentanyl (FEN)",
                "Heroin (Diacetylmorphine)",
                "Hydrocodone (HYC)",
                "Hydromorphone (HYD)",
                "Kratom (Mitragyna speciosa)",
                "Loperamide (LOP)",
                "Meperidine (MEP)",
                "Methadone (METH)",
                "Morphine (MOR)",
                "Nalbuphine (NAL)",
                "Oxycodone (OXY)",
                "Oxymorphone (OXM)",
                "Pentazocine (PEN)",
                "Sufentanil (SUF)",
                "Tapentadol (TAP)",
                "Tramadol (TRA)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4-32k",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Heroin (HER)",
            "Codeine (COD)",
            "Oxycodone (OXY)",
            "Hydrocodone (HYD)",
            "Fentanyl (FEN)",
            "Methadone (MET)",
            "Meperidine (MEP)",
            "Hydromorphone (HYM)",
            "Oxymorphone (OXM)",
            "Buprenorphine (BUP)",
            "Tramadol (TRA)",
            "Carfentanil (CAR)",
            "Pethidine (PET)",
            "Diamorphine (DIA)",
            "Levorphanol (LEV)",
            "Butorphanol (BUT)",
            "Nalbuphine (NAL)",
            "Pentazocine (PEN)",
            "Sufentanil (SUF)",
            "Remifentanil (REM)",
            "Alfentanil (ALF)",
            "Methorphan (MEO)",
            "Propoxyphene (PRO)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Heroin (HER)",
                "Codeine (COD)",
                "Oxycodone (OXY)",
                "Hydrocodone (HYD)",
                "Fentanyl (FEN)",
                "Methadone (MET)",
                "Meperidine (MEP)",
                "Hydromorphone (HYM)",
                "Oxymorphone (OXM)",
                "Buprenorphine (BUP)",
                "Tramadol (TRA)",
                "Carfentanil (CAR)",
                "Pethidine (PET)",
                "Diamorphine (DIA)",
                "Levorphanol (LEV)",
                "Butorphanol (BUT)",
                "Nalbuphine (NAL)",
                "Pentazocine (PEN)",
                "Sufentanil (SUF)",
                "Remifentanil (REM)",
                "Alfentanil (ALF)",
                "Methorphan (MEO)",
                "Propoxyphene (PRO)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil (ALF)",
                "Buprenorphine (BUP)",
                "Butorphanol (BUT)",
                "Carfentanil (CAR)",
                "Codeine (COD)",
                "Diamorphine (DIA)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Hydrocodone (HYD)",
                "Hydromorphone (HYM)",
                "Levorphanol (LEV)",
                "Meperidine (MEP)",
                "Methadone (MET)",
                "Methorphan (MEO)",
                "Morphine (MOR)",
                "Nalbuphine (NAL)",
                "Oxycodone (OXY)",
                "Oxymorphone (OXM)",
                "Pentazocine (PEN)",
                "Pethidine (PET)",
                "Propoxyphene (PRO)",
                "Remifentanil (REM)",
                "Sufentanil (SUF)",
                "Tramadol (TRA)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-35-turbo-16k",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine",
            "Heroin",
            "Codeine",
            "Fentanyl",
            "Oxycodone",
            "Hydrocodone",
            "Methadone",
            "Meperidine",
            "Tramadol",
            "Buprenorphine",
            "Methamphetamine",
            "Cocaine",
            "LSD",
            "Ketamine",
            "MDMA",
            "Psilocybin",
            "Marijuana",
            "Opium",
            "Kratom",
            "Kava",
            "Salvia divinorum",
            "Dextromethorphan",
            "Naloxone",
            "Naltrexone"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine",
                "Heroin",
                "Codeine",
                "Fentanyl",
                "Oxycodone",
                "Hydrocodone",
                "Methadone",
                "Meperidine",
                "Tramadol",
                "Buprenorphine",
                "Methamphetamine",
                "Cocaine",
                "LSD",
                "Ketamine",
                "MDMA",
                "Psilocybin",
                "Marijuana",
                "Opium",
                "Kratom",
                "Kava",
                "Salvia divinorum",
                "Dextromethorphan",
                "Naloxone",
                "Naltrexone"
            ],
            "mismatches": [],
            "true_referents": [
                "Buprenorphine",
                "Cocaine",
                "Codeine",
                "Dextromethorphan",
                "Fentanyl",
                "Heroin",
                "Hydrocodone",
                "Kava",
                "Ketamine",
                "Kratom",
                "LSD",
                "MDMA",
                "Marijuana",
                "Meperidine",
                "Methadone",
                "Methamphetamine",
                "Morphine",
                "Naloxone",
                "Naltrexone",
                "Opium",
                "Oxycodone",
                "Psilocybin",
                "Salvia divinorum",
                "Tramadol"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "o1-mini",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Heroin (Diacetylmorphine)",
            "Codeine",
            "Fentanyl (FEN)",
            "Oxycodone (OXY)",
            "Hydrocodone (HC)",
            "Methadone (MET)",
            "Buprenorphine (BUP)",
            "Meperidine (Demerol)",
            "Pentazocine (PTZ)",
            "Levorphanol",
            "Nalorphine",
            "Propoxyphene",
            "Tramadol (TRA)",
            "Sufentanil",
            "Remifentanil",
            "Alfentanil",
            "Oxymorphone (OXYM)",
            "Tilidine",
            "Etorphine",
            "Nalbuphine",
            "Butorphanol",
            "Dihydrocodeine",
            "Hydromorphone (HYD)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Heroin (Diacetylmorphine)",
                "Codeine",
                "Fentanyl (FEN)",
                "Oxycodone (OXY)",
                "Hydrocodone (HC)",
                "Methadone (MET)",
                "Buprenorphine (BUP)",
                "Meperidine (Demerol)",
                "Pentazocine (PTZ)",
                "Levorphanol",
                "Nalorphine",
                "Propoxyphene",
                "Tramadol (TRA)",
                "Sufentanil",
                "Remifentanil",
                "Alfentanil",
                "Oxymorphone (OXYM)",
                "Tilidine",
                "Etorphine",
                "Nalbuphine",
                "Butorphanol",
                "Dihydrocodeine",
                "Hydromorphone (HYD)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil",
                "Buprenorphine (BUP)",
                "Butorphanol",
                "Codeine",
                "Dihydrocodeine",
                "Etorphine",
                "Fentanyl (FEN)",
                "Heroin (Diacetylmorphine)",
                "Hydrocodone (HC)",
                "Hydromorphone (HYD)",
                "Levorphanol",
                "Meperidine (Demerol)",
                "Methadone (MET)",
                "Morphine (MOR)",
                "Nalbuphine",
                "Nalorphine",
                "Oxycodone (OXY)",
                "Oxymorphone (OXYM)",
                "Pentazocine (PTZ)",
                "Propoxyphene",
                "Remifentanil",
                "Sufentanil",
                "Tilidine",
                "Tramadol (TRA)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-5-sonnet-v1:0",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Codeine (COD)",
            "Oxycodone (OXY)",
            "Hydrocodone (HYC)",
            "Fentanyl (FEN)",
            "Heroin (HER)",
            "Methadone (MTD)",
            "Buprenorphine (BUP)",
            "Hydromorphone (HMO)",
            "Oxymorphone (OMO)",
            "Tramadol (TRA)",
            "Tapentadol (TAP)",
            "Meperidine (MEP)",
            "Levorphanol (LEV)",
            "Sufentanil (SUF)",
            "Remifentanil (REM)",
            "Alfentanil (ALF)",
            "Carfentanil (CAR)",
            "Pentazocine (PEN)",
            "Nalbuphine (NAL)",
            "Butorphanol (BUT)",
            "Dezocine (DEZ)",
            "Opium (OPI)",
            "Dextropropoxyphene (DEX)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Codeine (COD)",
                "Oxycodone (OXY)",
                "Hydrocodone (HYC)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Methadone (MTD)",
                "Buprenorphine (BUP)",
                "Hydromorphone (HMO)",
                "Oxymorphone (OMO)",
                "Tramadol (TRA)",
                "Tapentadol (TAP)",
                "Meperidine (MEP)",
                "Levorphanol (LEV)",
                "Sufentanil (SUF)",
                "Remifentanil (REM)",
                "Alfentanil (ALF)",
                "Carfentanil (CAR)",
                "Pentazocine (PEN)",
                "Nalbuphine (NAL)",
                "Butorphanol (BUT)",
                "Dezocine (DEZ)",
                "Opium (OPI)",
                "Dextropropoxyphene (DEX)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil (ALF)",
                "Buprenorphine (BUP)",
                "Butorphanol (BUT)",
                "Carfentanil (CAR)",
                "Codeine (COD)",
                "Dextropropoxyphene (DEX)",
                "Dezocine (DEZ)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Hydrocodone (HYC)",
                "Hydromorphone (HMO)",
                "Levorphanol (LEV)",
                "Meperidine (MEP)",
                "Methadone (MTD)",
                "Morphine (MOR)",
                "Nalbuphine (NAL)",
                "Opium (OPI)",
                "Oxycodone (OXY)",
                "Oxymorphone (OMO)",
                "Pentazocine (PEN)",
                "Remifentanil (REM)",
                "Sufentanil (SUF)",
                "Tapentadol (TAP)",
                "Tramadol (TRA)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-opus-v1:0",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOP)",
            "Codeine",
            "Heroin (Diacetylmorphine)",
            "Hydrocodone (Zohydro ER)",
            "Oxycodone (OxyContin)",
            "Fentanyl",
            "Hydromorphone (Dilaudid)",
            "Methadone",
            "Buprenorphine (Subutex)",
            "Oxymorphone (Opana)",
            "Meperidine (Demerol)",
            "Tramadol (Ultram)",
            "Tapentadol (Nucynta)",
            "Sufentanil (Dsuvia)",
            "Remifentanil (Ultiva)",
            "Pentazocine (Talwin)",
            "Propoxyphene (Darvon)",
            "Levorphanol",
            "Butorphanol (Stadol)",
            "Nalbuphine (Nubain)",
            "Dezocine",
            "Ketobemidone",
            "Dextropropoxyphene (Darvocet)",
            "Alphaprodine (Nisentil)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOP)",
                "Codeine",
                "Heroin (Diacetylmorphine)",
                "Hydrocodone (Zohydro ER)",
                "Oxycodone (OxyContin)",
                "Fentanyl",
                "Hydromorphone (Dilaudid)",
                "Methadone",
                "Buprenorphine (Subutex)",
                "Oxymorphone (Opana)",
                "Meperidine (Demerol)",
                "Tramadol (Ultram)",
                "Tapentadol (Nucynta)",
                "Sufentanil (Dsuvia)",
                "Remifentanil (Ultiva)",
                "Pentazocine (Talwin)",
                "Propoxyphene (Darvon)",
                "Levorphanol",
                "Butorphanol (Stadol)",
                "Nalbuphine (Nubain)",
                "Dezocine",
                "Ketobemidone",
                "Dextropropoxyphene (Darvocet)",
                "Alphaprodine (Nisentil)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alphaprodine (Nisentil)",
                "Buprenorphine (Subutex)",
                "Butorphanol (Stadol)",
                "Codeine",
                "Dextropropoxyphene (Darvocet)",
                "Dezocine",
                "Fentanyl",
                "Heroin (Diacetylmorphine)",
                "Hydrocodone (Zohydro ER)",
                "Hydromorphone (Dilaudid)",
                "Ketobemidone",
                "Levorphanol",
                "Meperidine (Demerol)",
                "Methadone",
                "Morphine (MOP)",
                "Nalbuphine (Nubain)",
                "Oxycodone (OxyContin)",
                "Oxymorphone (Opana)",
                "Pentazocine (Talwin)",
                "Propoxyphene (Darvon)",
                "Remifentanil (Ultiva)",
                "Sufentanil (Dsuvia)",
                "Tapentadol (Nucynta)",
                "Tramadol (Ultram)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-sonnet-v1:0",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Codeine (COD)",
            "Oxycodone (OXY)",
            "Hydrocodone (HYD)",
            "Fentanyl (FEN)",
            "Heroin (HER)",
            "Methadone (MTD)",
            "Buprenorphine (BUP)",
            "Tramadol (TRA)",
            "Tapentadol (TAP)",
            "Oxymorphone (OXM)",
            "Hydromorphone (HYM)",
            "Meperidine (MEP)",
            "Propoxyphene (PRO)",
            "Pentazocine (PEN)",
            "Butorphanol (BUT)",
            "Nalbuphine (NAL)",
            "Remifentanil (REM)",
            "Sufentanil (SUF)",
            "Alfentanil (ALF)",
            "Carfentanil (CAR)",
            "Loperamide (LOP)",
            "Diphenoxylate (DIP)",
            "Paregoric (PAR)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Codeine (COD)",
                "Oxycodone (OXY)",
                "Hydrocodone (HYD)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Methadone (MTD)",
                "Buprenorphine (BUP)",
                "Tramadol (TRA)",
                "Tapentadol (TAP)",
                "Oxymorphone (OXM)",
                "Hydromorphone (HYM)",
                "Meperidine (MEP)",
                "Propoxyphene (PRO)",
                "Pentazocine (PEN)",
                "Butorphanol (BUT)",
                "Nalbuphine (NAL)",
                "Remifentanil (REM)",
                "Sufentanil (SUF)",
                "Alfentanil (ALF)",
                "Carfentanil (CAR)",
                "Loperamide (LOP)",
                "Diphenoxylate (DIP)",
                "Paregoric (PAR)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil (ALF)",
                "Buprenorphine (BUP)",
                "Butorphanol (BUT)",
                "Carfentanil (CAR)",
                "Codeine (COD)",
                "Diphenoxylate (DIP)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Hydrocodone (HYD)",
                "Hydromorphone (HYM)",
                "Loperamide (LOP)",
                "Meperidine (MEP)",
                "Methadone (MTD)",
                "Morphine (MOR)",
                "Nalbuphine (NAL)",
                "Oxycodone (OXY)",
                "Oxymorphone (OXM)",
                "Paregoric (PAR)",
                "Pentazocine (PEN)",
                "Propoxyphene (PRO)",
                "Remifentanil (REM)",
                "Sufentanil (SUF)",
                "Tapentadol (TAP)",
                "Tramadol (TRA)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-haiku-v1:0",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Heroin (HER)",
            "Oxycodone (OXY)",
            "Fentanyl (FEN)",
            "Codeine (COD)",
            "Hydrocodone (HYD)",
            "Methadone (MET)",
            "Tramadol (TRA)",
            "Hydromorphone (HYM)",
            "Buprenorphine (BUP)",
            "Oxymorphone (OXM)",
            "Sufentanil (SUF)",
            "Remifentanil (REM)",
            "Alfentanil (ALF)",
            "Levorphanol (LEV)",
            "Meperidine (MEP)",
            "Propoxyphene (PRO)",
            "Pentazocine (PEN)",
            "Butorphanol (BUT)",
            "Nalbuphine (NAL)",
            "Dezocine (DEZ)",
            "Loperamide (LOP)",
            "Diphenoxylate (DIP)",
            "Lofexidine (LOF)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Heroin (HER)",
                "Oxycodone (OXY)",
                "Fentanyl (FEN)",
                "Codeine (COD)",
                "Hydrocodone (HYD)",
                "Methadone (MET)",
                "Tramadol (TRA)",
                "Hydromorphone (HYM)",
                "Buprenorphine (BUP)",
                "Oxymorphone (OXM)",
                "Sufentanil (SUF)",
                "Remifentanil (REM)",
                "Alfentanil (ALF)",
                "Levorphanol (LEV)",
                "Meperidine (MEP)",
                "Propoxyphene (PRO)",
                "Pentazocine (PEN)",
                "Butorphanol (BUT)",
                "Nalbuphine (NAL)",
                "Dezocine (DEZ)",
                "Loperamide (LOP)",
                "Diphenoxylate (DIP)",
                "Lofexidine (LOF)"
            ],
            "mismatches": [],
            "true_referents": [
                "Alfentanil (ALF)",
                "Buprenorphine (BUP)",
                "Butorphanol (BUT)",
                "Codeine (COD)",
                "Dezocine (DEZ)",
                "Diphenoxylate (DIP)",
                "Fentanyl (FEN)",
                "Heroin (HER)",
                "Hydrocodone (HYD)",
                "Hydromorphone (HYM)",
                "Levorphanol (LEV)",
                "Lofexidine (LOF)",
                "Loperamide (LOP)",
                "Meperidine (MEP)",
                "Methadone (MET)",
                "Morphine (MOR)",
                "Nalbuphine (NAL)",
                "Oxycodone (OXY)",
                "Oxymorphone (OXM)",
                "Pentazocine (PEN)",
                "Propoxyphene (PRO)",
                "Remifentanil (REM)",
                "Sufentanil (SUF)",
                "Tramadol (TRA)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "meta.llama3-8b-instruct-v1:0",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Morphine (MOR)",
            "Codeine",
            "Hydrocodone (HYC)",
            "Oxycodone (OXY)",
            "Fentanyl (FNT)",
            "Sufentanil (SUF)",
            "Remifentanil (REM)",
            "Pethidine (PET)",
            "Meperidine (MEP)",
            "Tramadol (TRA)",
            "Tapentadol (TAP)",
            "Codeine-phenylephrine",
            "Hydrocodone-acetaminophen",
            "Oxycodone-acetaminophen",
            "Fentanyl- citrate",
            "Sufentanil- citrate",
            "Remifentanil- citrate",
            "Pethidine- hydrochloride",
            "Meperidine- hydrochloride",
            "Tramadol- hydrochloride",
            "Tapentadol- hydrochloride",
            "Nalbuphine (NAL)",
            "Nalmefene (NMF)",
            "Naltrexone (NTX)",
            "Buprenorphine (BUP)",
            "Butorphanol (BUT)",
            "Pentazocine (PNT)",
            "Nalorphine (NAL)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Morphine (MOR)",
                "Codeine",
                "Hydrocodone (HYC)",
                "Oxycodone (OXY)",
                "Fentanyl (FNT)",
                "Sufentanil (SUF)",
                "Remifentanil (REM)",
                "Pethidine (PET)",
                "Meperidine (MEP)",
                "Tramadol (TRA)",
                "Tapentadol (TAP)",
                "Codeine-phenylephrine",
                "Hydrocodone-acetaminophen",
                "Oxycodone-acetaminophen",
                "Fentanyl- citrate",
                "Sufentanil- citrate",
                "Remifentanil- citrate",
                "Pethidine- hydrochloride",
                "Meperidine- hydrochloride",
                "Tramadol- hydrochloride",
                "Tapentadol- hydrochloride",
                "Nalbuphine (NAL)",
                "Nalmefene (NMF)",
                "Naltrexone (NTX)"
            ],
            "mismatches": [
                "Buprenorphine (BUP)",
                "Butorphanol (BUT)",
                "Pentazocine (PNT)",
                "Nalorphine (NAL)"
            ],
            "true_referents": [
                "Codeine",
                "Codeine-phenylephrine",
                "Fentanyl (FNT)",
                "Fentanyl- citrate",
                "Hydrocodone (HYC)",
                "Hydrocodone-acetaminophen",
                "Meperidine (MEP)",
                "Meperidine- hydrochloride",
                "Morphine (MOR)",
                "Nalbuphine (NAL)",
                "Nalmefene (NMF)",
                "Naltrexone (NTX)",
                "Oxycodone (OXY)",
                "Oxycodone-acetaminophen",
                "Pethidine (PET)",
                "Pethidine- hydrochloride",
                "Remifentanil (REM)",
                "Remifentanil- citrate",
                "Sufentanil (SUF)",
                "Sufentanil- citrate",
                "Tapentadol (TAP)",
                "Tapentadol- hydrochloride",
                "Tramadol (TRA)",
                "Tramadol- hydrochloride"
            ],
            "TP": 24,
            "FP": 4,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "meta.llama3-70b-instruct-v1:0",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Opium (OPIO)",
            "Morphine (MOR)",
            "Codeine (CODE)",
            "Heroin (HER)",
            "Fentanyl (FENT)",
            "Meperidine (MEPER)",
            "Hydromorphone (HYDRO)",
            "Oxycodone (OXY)",
            "Hydrocodone (HYDROCOD)",
            "Levorphanol (LEVOR)",
            "Oxymorphone (OXYMOR)",
            "Propoxyphene (PROPOX)",
            "Pentazocine (PENT)",
            "Butorphanol (BUTO)",
            "Buprenorphine (BUPREN)",
            "Nalbuphine (NALBUP)",
            "Etomidate (ETOMI)",
            "Sufentanil (SUFE)",
            "Alfentanil (ALFEN)",
            "Remifentanil (REMIF)",
            "Fentanyl analog (FENTAN)",
            "Carfentanil (CARFEN)",
            "Ah-7921 (AH7921)",
            "U-47700 (U47700)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Opium (OPIO)",
                "Morphine (MOR)",
                "Codeine (CODE)",
                "Heroin (HER)",
                "Fentanyl (FENT)",
                "Meperidine (MEPER)",
                "Hydromorphone (HYDRO)",
                "Oxycodone (OXY)",
                "Hydrocodone (HYDROCOD)",
                "Levorphanol (LEVOR)",
                "Oxymorphone (OXYMOR)",
                "Propoxyphene (PROPOX)",
                "Pentazocine (PENT)",
                "Butorphanol (BUTO)",
                "Buprenorphine (BUPREN)",
                "Nalbuphine (NALBUP)",
                "Etomidate (ETOMI)",
                "Sufentanil (SUFE)",
                "Alfentanil (ALFEN)",
                "Remifentanil (REMIF)",
                "Fentanyl analog (FENTAN)",
                "Carfentanil (CARFEN)",
                "Ah-7921 (AH7921)",
                "U-47700 (U47700)"
            ],
            "mismatches": [],
            "true_referents": [
                "Ah-7921 (AH7921)",
                "Alfentanil (ALFEN)",
                "Buprenorphine (BUPREN)",
                "Butorphanol (BUTO)",
                "Carfentanil (CARFEN)",
                "Codeine (CODE)",
                "Etomidate (ETOMI)",
                "Fentanyl (FENT)",
                "Fentanyl analog (FENTAN)",
                "Heroin (HER)",
                "Hydrocodone (HYDROCOD)",
                "Hydromorphone (HYDRO)",
                "Levorphanol (LEVOR)",
                "Meperidine (MEPER)",
                "Morphine (MOR)",
                "Nalbuphine (NALBUP)",
                "Opium (OPIO)",
                "Oxycodone (OXY)",
                "Oxymorphone (OXYMOR)",
                "Pentazocine (PENT)",
                "Propoxyphene (PROPOX)",
                "Remifentanil (REMIF)",
                "Sufentanil (SUFE)",
                "U-47700 (U47700)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "amazon.titan-text-express-v1",
        "concept": "narcotic agent",
        "domain": "medicine",
        "response": [
            "Anesthetics",
            "Benzodiazepines",
            "Cannabinoids",
            "Codeine",
            "Dextromethorphan",
            "Fentanyl",
            "Heroin",
            "LSD",
            "Morphine",
            "Nicotine",
            "Opium",
            "Opiates",
            "Pethidine",
            "Propoxyphene",
            "Psilocybin",
            "Sedatives",
            "Steroids",
            "Tramadol",
            "Tricyclic antidepressants",
            "Tryptamines",
            "Valium",
            "Xanax"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Anesthetics",
                "Benzodiazepines",
                "Cannabinoids",
                "Codeine",
                "Dextromethorphan",
                "Fentanyl",
                "Heroin",
                "LSD",
                "Morphine",
                "Nicotine",
                "Opium",
                "Opiates",
                "Pethidine",
                "Propoxyphene",
                "Psilocybin",
                "Sedatives",
                "Steroids",
                "Tramadol",
                "Tricyclic antidepressants",
                "Tryptamines",
                "Valium",
                "Xanax"
            ],
            "mismatches": [],
            "true_referents": [
                "Anesthetics",
                "Benzodiazepines",
                "Cannabinoids",
                "Codeine",
                "Dextromethorphan",
                "Fentanyl",
                "Heroin",
                "LSD",
                "Morphine",
                "Nicotine",
                "Opiates",
                "Opium",
                "Pethidine",
                "Propoxyphene",
                "Psilocybin",
                "Sedatives",
                "Steroids",
                "Tramadol",
                "Tricyclic antidepressants",
                "Tryptamines",
                "Valium",
                "Xanax"
            ],
            "TP": 22,
            "FP": 0,
            "FN": 0
        }
    }
]