[
    {
        "test": "limited-list-referents",
        "responder": "qwen-v2.5-14b-instruct",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Adrenocorticotropic hormone (ACTH)",
            "Angiotensin II",
            "Atrial natriuretic peptide (ANP)",
            "Beta-endorphin",
            "Cholecystokinin (CCK)",
            "Dopamine",
            "Erythropoietin (EPO)",
            "Follicle-stimulating hormone (FSH)",
            "Gastrin",
            "Ghrelin",
            "Glucagon",
            "Gonadotropin-releasing hormone (GnRH)",
            "Growth hormone (GH)",
            "Insulin",
            "Leptin",
            "Luteinizing hormone (LH)",
            "Melanocyte-stimulating hormone (MSH)",
            "Oxytocin",
            "Parathyroid hormone (PTH)",
            "Prolactin",
            "Prostaglandins",
            "Somatostatin",
            "Thrombopoietin (TPO)",
            "Vasopressin (Antidiuretic hormone, ADH)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Adrenocorticotropic hormone (ACTH)",
                "Angiotensin II",
                "Atrial natriuretic peptide (ANP)",
                "Beta-endorphin",
                "Cholecystokinin (CCK)",
                "Dopamine",
                "Erythropoietin (EPO)",
                "Follicle-stimulating hormone (FSH)",
                "Gastrin",
                "Ghrelin",
                "Glucagon",
                "Gonadotropin-releasing hormone (GnRH)",
                "Growth hormone (GH)",
                "Insulin",
                "Leptin",
                "Luteinizing hormone (LH)",
                "Melanocyte-stimulating hormone (MSH)",
                "Oxytocin",
                "Parathyroid hormone (PTH)",
                "Prolactin",
                "Prostaglandins",
                "Somatostatin",
                "Thrombopoietin (TPO)",
                "Vasopressin (Antidiuretic hormone, ADH)"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenocorticotropic hormone (ACTH)",
                "Angiotensin II",
                "Atrial natriuretic peptide (ANP)",
                "Beta-endorphin",
                "Cholecystokinin (CCK)",
                "Dopamine",
                "Erythropoietin (EPO)",
                "Follicle-stimulating hormone (FSH)",
                "Gastrin",
                "Ghrelin",
                "Glucagon",
                "Gonadotropin-releasing hormone (GnRH)",
                "Growth hormone (GH)",
                "Insulin",
                "Leptin",
                "Luteinizing hormone (LH)",
                "Melanocyte-stimulating hormone (MSH)",
                "Oxytocin",
                "Parathyroid hormone (PTH)",
                "Prolactin",
                "Prostaglandins",
                "Somatostatin",
                "Thrombopoietin (TPO)",
                "Vasopressin (Antidiuretic hormone, ADH)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "phi-v4",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Adrenaline (Epinephrine)",
            "Noradrenaline (Norepinephrine)",
            "Dopamine",
            "Serotonin (5-Hydroxytryptamine)",
            "Histamine",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Melatonin",
            "Catecholamines",
            "Insulin-like Growth Factor 1 (IGF-1)",
            "Growth Hormone (Somatotropin)",
            "Oxytocin",
            "Vasopressin (Antidiuretic Hormone, ADH)",
            "Endorphins",
            "Dynorphins",
            "Enkephalins",
            "Thyrotropin-Releasing Hormone (TRH)",
            "Gonadotropin-Releasing Hormone (GnRH)",
            "Growth Hormone-Releasing Hormone (GHRH)",
            "Somatostatin",
            "Corticotropin-Releasing Hormone (CRH)",
            "Substance P",
            "Neurokinin A"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Adrenaline (Epinephrine)",
                "Noradrenaline (Norepinephrine)",
                "Dopamine",
                "Serotonin (5-Hydroxytryptamine)",
                "Histamine",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Melatonin",
                "Catecholamines",
                "Insulin-like Growth Factor 1 (IGF-1)",
                "Growth Hormone (Somatotropin)",
                "Oxytocin",
                "Vasopressin (Antidiuretic Hormone, ADH)",
                "Endorphins",
                "Dynorphins",
                "Enkephalins",
                "Thyrotropin-Releasing Hormone (TRH)",
                "Gonadotropin-Releasing Hormone (GnRH)",
                "Growth Hormone-Releasing Hormone (GHRH)",
                "Somatostatin",
                "Corticotropin-Releasing Hormone (CRH)",
                "Substance P",
                "Neurokinin A"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenaline (Epinephrine)",
                "Catecholamines",
                "Corticotropin-Releasing Hormone (CRH)",
                "Dopamine",
                "Dynorphins",
                "Endorphins",
                "Enkephalins",
                "Gonadotropin-Releasing Hormone (GnRH)",
                "Growth Hormone (Somatotropin)",
                "Growth Hormone-Releasing Hormone (GHRH)",
                "Histamine",
                "Insulin-like Growth Factor 1 (IGF-1)",
                "Melatonin",
                "Neurokinin A",
                "Noradrenaline (Norepinephrine)",
                "Oxytocin",
                "Serotonin (5-Hydroxytryptamine)",
                "Somatostatin",
                "Substance P",
                "Thyrotropin-Releasing Hormone (TRH)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Vasopressin (Antidiuretic Hormone, ADH)"
            ],
            "TP": 23,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "mistral-small-instruct-24B",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Adrenaline (Epinephrine)",
            "Noradrenaline (Norepinephrine)",
            "Dopamine",
            "Serotonin (5-Hydroxytryptamine)",
            "Histamine",
            "Melatonin",
            "Thyroxine",
            "Triiodothyronine",
            "Oxytocin",
            "Vasopressin",
            "Insulin",
            "Glucagon",
            "Gastrin",
            "Somatostatin",
            "Secretin",
            "Cholecystokinin",
            "Gastrin-releasing peptide",
            "Neuropeptide Y",
            "Thyrotropin-releasing hormone",
            "Gonadotropin-releasing hormone",
            "Corticotropin-releasing hormone",
            "Growth hormone-releasing hormone",
            "Proopiomelanocortin",
            "Adrenocorticotropic hormone",
            "Melanocyte-stimulating hormone"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Adrenaline (Epinephrine)",
                "Noradrenaline (Norepinephrine)",
                "Dopamine",
                "Serotonin (5-Hydroxytryptamine)",
                "Histamine",
                "Melatonin",
                "Thyroxine",
                "Triiodothyronine",
                "Oxytocin",
                "Vasopressin",
                "Insulin",
                "Glucagon",
                "Gastrin",
                "Somatostatin",
                "Secretin",
                "Cholecystokinin",
                "Gastrin-releasing peptide",
                "Neuropeptide Y",
                "Thyrotropin-releasing hormone",
                "Gonadotropin-releasing hormone",
                "Corticotropin-releasing hormone",
                "Growth hormone-releasing hormone",
                "Proopiomelanocortin",
                "Adrenocorticotropic hormone"
            ],
            "mismatches": [
                "Melanocyte-stimulating hormone"
            ],
            "true_referents": [
                "Adrenaline (Epinephrine)",
                "Adrenocorticotropic hormone",
                "Cholecystokinin",
                "Corticotropin-releasing hormone",
                "Dopamine",
                "Gastrin",
                "Gastrin-releasing peptide",
                "Glucagon",
                "Gonadotropin-releasing hormone",
                "Growth hormone-releasing hormone",
                "Histamine",
                "Insulin",
                "Melatonin",
                "Neuropeptide Y",
                "Noradrenaline (Norepinephrine)",
                "Oxytocin",
                "Proopiomelanocortin",
                "Secretin",
                "Serotonin (5-Hydroxytryptamine)",
                "Somatostatin",
                "Thyrotropin-releasing hormone",
                "Thyroxine",
                "Triiodothyronine",
                "Vasopressin"
            ],
            "TP": 24,
            "FP": 1,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gemma-3",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Melatonin (MEL)",
            "Serotonin (5-HT)",
            "Histamine (HIST)",
            "Glutamate (GLU) - as a neurohormone",
            "Glycine (GLY) - as a neurohormone",
            "Gamma-aminobutyric acid (GABA) - as a neurohormone",
            "Tyrosine-derived catecholamines",
            "Tryptophan-derived serotonin",
            "Phenylalanine-derived tyrosine",
            "Histidine-derived histamine",
            "Aspartic acid-derived N-methyl-D-aspartate (NMDA)",
            "Glutamic acid-derived glutamate",
            "L-DOPA (precursor to dopamine)",
            "3,4-Dihydroxyphenylacetic acid (DOPAC)",
            "Homovanillic acid (HVA)",
            "3-O-Methylcatecholamines",
            "Methoxytryptamine",
            "5-Hydroxyindoleacetic acid (5-HIAA)",
            "N-Acetylserotonin"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Melatonin (MEL)",
                "Serotonin (5-HT)",
                "Histamine (HIST)",
                "Glutamate (GLU) - as a neurohormone",
                "Glycine (GLY) - as a neurohormone",
                "Gamma-aminobutyric acid (GABA) - as a neurohormone",
                "Tyrosine-derived catecholamines",
                "Tryptophan-derived serotonin",
                "Phenylalanine-derived tyrosine",
                "Histidine-derived histamine",
                "Aspartic acid-derived N-methyl-D-aspartate (NMDA)",
                "Glutamic acid-derived glutamate",
                "L-DOPA (precursor to dopamine)",
                "3,4-Dihydroxyphenylacetic acid (DOPAC)",
                "Homovanillic acid (HVA)",
                "3-O-Methylcatecholamines",
                "Methoxytryptamine",
                "5-Hydroxyindoleacetic acid (5-HIAA)",
                "N-Acetylserotonin"
            ],
            "mismatches": [],
            "true_referents": [
                "3,4-Dihydroxyphenylacetic acid (DOPAC)",
                "3-O-Methylcatecholamines",
                "5-Hydroxyindoleacetic acid (5-HIAA)",
                "Aspartic acid-derived N-methyl-D-aspartate (NMDA)",
                "Dopamine (DA)",
                "Epinephrine (EPI)",
                "Gamma-aminobutyric acid (GABA) - as a neurohormone",
                "Glutamate (GLU) - as a neurohormone",
                "Glutamic acid-derived glutamate",
                "Glycine (GLY) - as a neurohormone",
                "Histamine (HIST)",
                "Histidine-derived histamine",
                "Homovanillic acid (HVA)",
                "L-DOPA (precursor to dopamine)",
                "Melatonin (MEL)",
                "Methoxytryptamine",
                "N-Acetylserotonin",
                "Norepinephrine (NE)",
                "Phenylalanine-derived tyrosine",
                "Serotonin (5-HT)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Tryptophan-derived serotonin",
                "Tyrosine-derived catecholamines"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4o",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Serotonin (5-HT)",
            "Melatonin (MT)",
            "Histamine (HA)",
            "Glutamate (Glu)",
            "Gamma-aminobutyric acid (GABA)",
            "Taurine",
            "Ornithine",
            "Citrulline",
            "Arginine Vasopressin (AVP)",
            "Oxytocin (OT)",
            "Somatostatin (SS)",
            "Calcitonin (CT)",
            "Parathyroid Hormone (PTH)",
            "Adrenocorticotropic Hormone (ACTH)",
            "Luteinizing Hormone (LH)",
            "Follicle Stimulating Hormone (FSH)",
            "Growth Hormone (GH)",
            "Prolactin (PRL)",
            "Insulin"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Serotonin (5-HT)",
                "Melatonin (MT)",
                "Histamine (HA)",
                "Glutamate (Glu)",
                "Gamma-aminobutyric acid (GABA)",
                "Taurine",
                "Ornithine",
                "Citrulline",
                "Arginine Vasopressin (AVP)",
                "Oxytocin (OT)",
                "Somatostatin (SS)",
                "Calcitonin (CT)",
                "Parathyroid Hormone (PTH)",
                "Adrenocorticotropic Hormone (ACTH)",
                "Luteinizing Hormone (LH)",
                "Follicle Stimulating Hormone (FSH)",
                "Growth Hormone (GH)",
                "Prolactin (PRL)",
                "Insulin"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenocorticotropic Hormone (ACTH)",
                "Arginine Vasopressin (AVP)",
                "Calcitonin (CT)",
                "Citrulline",
                "Dopamine (DA)",
                "Epinephrine (EPI)",
                "Follicle Stimulating Hormone (FSH)",
                "Gamma-aminobutyric acid (GABA)",
                "Glutamate (Glu)",
                "Growth Hormone (GH)",
                "Histamine (HA)",
                "Insulin",
                "Luteinizing Hormone (LH)",
                "Melatonin (MT)",
                "Norepinephrine (NE)",
                "Ornithine",
                "Oxytocin (OT)",
                "Parathyroid Hormone (PTH)",
                "Prolactin (PRL)",
                "Serotonin (5-HT)",
                "Somatostatin (SS)",
                "Taurine",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4o-mini",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Insulin (INS)",
            "Glucagon (GCG)",
            "Somatostatin (SST)",
            "Thyrotropin-releasing hormone (TRH)",
            "Gonadotropin-releasing hormone (GnRH)",
            "Adrenocorticotropic hormone (ACTH)",
            "Vasopressin (AVP)",
            "Oxytocin (OXT)",
            "Calcitonin (CT)",
            "Parathyroid hormone (PTH)",
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Serotonin (5-HT)",
            "Melatonin (MEL)",
            "Histamine (HIS)",
            "Angiotensin II (Ang II)",
            "Neuropeptide Y (NPY)",
            "Substance P (SP)",
            "Cholecystokinin (CCK)",
            "Gastrin (GAS)",
            "Corticotropin-releasing hormone (CRH)",
            "Leptin (LEP)",
            "Adiponectin (ADIPOQ)",
            "Corticosterone (CORT)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Insulin (INS)",
                "Glucagon (GCG)",
                "Somatostatin (SST)",
                "Thyrotropin-releasing hormone (TRH)",
                "Gonadotropin-releasing hormone (GnRH)",
                "Adrenocorticotropic hormone (ACTH)",
                "Vasopressin (AVP)",
                "Oxytocin (OXT)",
                "Calcitonin (CT)",
                "Parathyroid hormone (PTH)",
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Serotonin (5-HT)",
                "Melatonin (MEL)",
                "Histamine (HIS)",
                "Angiotensin II (Ang II)",
                "Neuropeptide Y (NPY)",
                "Substance P (SP)",
                "Cholecystokinin (CCK)",
                "Gastrin (GAS)",
                "Corticotropin-releasing hormone (CRH)",
                "Leptin (LEP)",
                "Adiponectin (ADIPOQ)",
                "Corticosterone (CORT)"
            ],
            "mismatches": [],
            "true_referents": [
                "Adiponectin (ADIPOQ)",
                "Adrenocorticotropic hormone (ACTH)",
                "Angiotensin II (Ang II)",
                "Calcitonin (CT)",
                "Cholecystokinin (CCK)",
                "Corticosterone (CORT)",
                "Corticotropin-releasing hormone (CRH)",
                "Epinephrine (EPI)",
                "Gastrin (GAS)",
                "Glucagon (GCG)",
                "Gonadotropin-releasing hormone (GnRH)",
                "Histamine (HIS)",
                "Insulin (INS)",
                "Leptin (LEP)",
                "Melatonin (MEL)",
                "Neuropeptide Y (NPY)",
                "Norepinephrine (NE)",
                "Oxytocin (OXT)",
                "Parathyroid hormone (PTH)",
                "Serotonin (5-HT)",
                "Somatostatin (SST)",
                "Substance P (SP)",
                "Thyrotropin-releasing hormone (TRH)",
                "Vasopressin (AVP)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-4-32k",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Serotonin (5-HT)",
            "Histamine (HIS)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Melatonin (MT)",
            "Glutamate (Glu)",
            "Glycine (Gly)",
            "Aspartate (Asp)",
            "Gamma-aminobutyric acid (GABA)",
            "Dihydroxyphenylalanine (DOPA)",
            "Tryptophan (Trp)",
            "Phenylalanine (Phe)",
            "Tyrosine (Tyr)",
            "Arginine (Arg)",
            "Lysine (Lys)",
            "Asparagine (Asn)",
            "Glutamine (Gln)",
            "Proline (Pro)",
            "Serine (Ser)",
            "Threonine (Thr)",
            "Cysteine (Cys)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Serotonin (5-HT)",
                "Histamine (HIS)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Melatonin (MT)",
                "Glutamate (Glu)",
                "Glycine (Gly)",
                "Aspartate (Asp)",
                "Gamma-aminobutyric acid (GABA)",
                "Dihydroxyphenylalanine (DOPA)",
                "Tryptophan (Trp)",
                "Phenylalanine (Phe)",
                "Tyrosine (Tyr)",
                "Arginine (Arg)",
                "Lysine (Lys)",
                "Asparagine (Asn)",
                "Glutamine (Gln)",
                "Proline (Pro)",
                "Serine (Ser)",
                "Threonine (Thr)",
                "Cysteine (Cys)"
            ],
            "mismatches": [],
            "true_referents": [
                "Arginine (Arg)",
                "Asparagine (Asn)",
                "Aspartate (Asp)",
                "Cysteine (Cys)",
                "Dihydroxyphenylalanine (DOPA)",
                "Dopamine (DA)",
                "Epinephrine (EPI)",
                "Gamma-aminobutyric acid (GABA)",
                "Glutamate (Glu)",
                "Glutamine (Gln)",
                "Glycine (Gly)",
                "Histamine (HIS)",
                "Lysine (Lys)",
                "Melatonin (MT)",
                "Norepinephrine (NE)",
                "Phenylalanine (Phe)",
                "Proline (Pro)",
                "Serine (Ser)",
                "Serotonin (5-HT)",
                "Threonine (Thr)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Tryptophan (Trp)",
                "Tyrosine (Tyr)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "gpt-35-turbo-16k",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Melatonin (MEL)",
            "Serotonin (5-HT)",
            "Histamine (HIS)",
            "Gastrin (GAS)",
            "Insulin (INS)",
            "Glucagon (GLU)",
            "Adrenocorticotropic hormone (ACTH)",
            "Growth hormone (GH)",
            "Prolactin (PRL)",
            "Follicle-stimulating hormone (FSH)",
            "Luteinizing hormone (LH)",
            "Thyroid-stimulating hormone (TSH)",
            "Antidiuretic hormone (ADH)",
            "Oxytocin (OXY)",
            "Calcitonin (CT)",
            "Parathyroid hormone (PTH)",
            "Cortisol (COR)",
            "Aldosterone (ALD)",
            "Testosterone (TES)",
            "Estrogen (EST)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Melatonin (MEL)",
                "Serotonin (5-HT)",
                "Histamine (HIS)",
                "Gastrin (GAS)",
                "Insulin (INS)",
                "Glucagon (GLU)",
                "Adrenocorticotropic hormone (ACTH)",
                "Growth hormone (GH)",
                "Prolactin (PRL)",
                "Follicle-stimulating hormone (FSH)",
                "Luteinizing hormone (LH)",
                "Thyroid-stimulating hormone (TSH)",
                "Antidiuretic hormone (ADH)",
                "Oxytocin (OXY)",
                "Calcitonin (CT)",
                "Parathyroid hormone (PTH)",
                "Cortisol (COR)",
                "Aldosterone (ALD)",
                "Testosterone (TES)"
            ],
            "mismatches": [
                "Estrogen (EST)"
            ],
            "true_referents": [
                "Adrenocorticotropic hormone (ACTH)",
                "Aldosterone (ALD)",
                "Antidiuretic hormone (ADH)",
                "Calcitonin (CT)",
                "Cortisol (COR)",
                "Dopamine (DA)",
                "Epinephrine (EPI)",
                "Follicle-stimulating hormone (FSH)",
                "Gastrin (GAS)",
                "Glucagon (GLU)",
                "Growth hormone (GH)",
                "Histamine (HIS)",
                "Insulin (INS)",
                "Luteinizing hormone (LH)",
                "Melatonin (MEL)",
                "Norepinephrine (NE)",
                "Oxytocin (OXY)",
                "Parathyroid hormone (PTH)",
                "Prolactin (PRL)",
                "Serotonin (5-HT)",
                "Testosterone (TES)",
                "Thyroid-stimulating hormone (TSH)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)"
            ],
            "TP": 24,
            "FP": 1,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "o1-mini",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Epinephrine (E)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Melatonin",
            "Serotonin (5-HT)",
            "Histamine",
            "Nitric Oxide (NO)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Epinephrine (E)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Melatonin",
                "Serotonin (5-HT)",
                "Histamine",
                "Nitric Oxide (NO)"
            ],
            "mismatches": [],
            "true_referents": [
                "Dopamine (DA)",
                "Epinephrine (E)",
                "Histamine",
                "Melatonin",
                "Nitric Oxide (NO)",
                "Norepinephrine (NE)",
                "Serotonin (5-HT)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)"
            ],
            "TP": 9,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-5-sonnet-v1:0",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Serotonin (5-HT)",
            "Melatonin (MLT)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Histamine",
            "Tryptamine",
            "Tyramine",
            "3,4-Dihydroxyphenylalanine (DOPA)",
            "Gamma-aminobutyric acid (GABA)",
            "Thyrotropin-releasing hormone (TRH)",
            "Gonadotropin-releasing hormone (GnRH)",
            "Oxytocin",
            "Vasopressin",
            "Melanocyte-stimulating hormone (MSH)",
            "Adrenocorticotropic hormone (ACTH)",
            "Calcitonin",
            "Gastrin",
            "Secretin",
            "Cholecystokinin (CCK)",
            "Glucagon",
            "Somatostatin"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Serotonin (5-HT)",
                "Melatonin (MLT)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Histamine",
                "Tryptamine",
                "Tyramine",
                "3,4-Dihydroxyphenylalanine (DOPA)",
                "Gamma-aminobutyric acid (GABA)",
                "Thyrotropin-releasing hormone (TRH)",
                "Gonadotropin-releasing hormone (GnRH)",
                "Oxytocin",
                "Vasopressin",
                "Melanocyte-stimulating hormone (MSH)",
                "Adrenocorticotropic hormone (ACTH)",
                "Calcitonin",
                "Gastrin",
                "Secretin",
                "Cholecystokinin (CCK)",
                "Glucagon",
                "Somatostatin"
            ],
            "mismatches": [],
            "true_referents": [
                "3,4-Dihydroxyphenylalanine (DOPA)",
                "Adrenocorticotropic hormone (ACTH)",
                "Calcitonin",
                "Cholecystokinin (CCK)",
                "Dopamine (DA)",
                "Epinephrine (EPI)",
                "Gamma-aminobutyric acid (GABA)",
                "Gastrin",
                "Glucagon",
                "Gonadotropin-releasing hormone (GnRH)",
                "Histamine",
                "Melanocyte-stimulating hormone (MSH)",
                "Melatonin (MLT)",
                "Norepinephrine (NE)",
                "Oxytocin",
                "Secretin",
                "Serotonin (5-HT)",
                "Somatostatin",
                "Thyrotropin-releasing hormone (TRH)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Tryptamine",
                "Tyramine",
                "Vasopressin"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-opus-v1:0",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Serotonin (5-HT)",
            "Melatonin",
            "Thyroid hormones (T3, T4)",
            "Calcitonin",
            "Adrenocorticotropic hormone (ACTH)",
            "Melanocyte-stimulating hormone (MSH)",
            "Growth hormone-releasing hormone (GHRH)",
            "Thyrotropin-releasing hormone (TRH)",
            "Gonadotropin-releasing hormone (GnRH)",
            "Corticotropin-releasing hormone (CRH)",
            "Somatostatin (SS)",
            "Vasopressin (AVP)",
            "Oxytocin",
            "Gastrin",
            "Cholecystokinin (CCK)",
            "Secretin",
            "Motilin",
            "Vasoactive intestinal peptide (VIP)",
            "Substance P",
            "Neuropeptide Y (NPY)",
            "Enkephalins"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Serotonin (5-HT)",
                "Melatonin",
                "Thyroid hormones (T3, T4)",
                "Calcitonin",
                "Adrenocorticotropic hormone (ACTH)",
                "Melanocyte-stimulating hormone (MSH)",
                "Growth hormone-releasing hormone (GHRH)",
                "Thyrotropin-releasing hormone (TRH)",
                "Gonadotropin-releasing hormone (GnRH)",
                "Corticotropin-releasing hormone (CRH)",
                "Somatostatin (SS)",
                "Vasopressin (AVP)",
                "Oxytocin",
                "Gastrin",
                "Cholecystokinin (CCK)",
                "Secretin",
                "Motilin",
                "Vasoactive intestinal peptide (VIP)",
                "Substance P",
                "Neuropeptide Y (NPY)",
                "Enkephalins"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenocorticotropic hormone (ACTH)",
                "Calcitonin",
                "Cholecystokinin (CCK)",
                "Corticotropin-releasing hormone (CRH)",
                "Dopamine (DA)",
                "Enkephalins",
                "Epinephrine (EPI)",
                "Gastrin",
                "Gonadotropin-releasing hormone (GnRH)",
                "Growth hormone-releasing hormone (GHRH)",
                "Melanocyte-stimulating hormone (MSH)",
                "Melatonin",
                "Motilin",
                "Neuropeptide Y (NPY)",
                "Norepinephrine (NE)",
                "Oxytocin",
                "Secretin",
                "Serotonin (5-HT)",
                "Somatostatin (SS)",
                "Substance P",
                "Thyroid hormones (T3, T4)",
                "Thyrotropin-releasing hormone (TRH)",
                "Vasoactive intestinal peptide (VIP)",
                "Vasopressin (AVP)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-sonnet-v1:0",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Melatonin (MLT)",
            "Epinephrine (EPI)",
            "Norepinephrine (NE)",
            "Dopamine (DA)",
            "Serotonin (5-HT)",
            "Histamine (HA)",
            "Thyroid hormones (T3, T4)",
            "Catecholamines (EPI, NE, DA)",
            "Tryptamine (TRY)",
            "Tyramine (TYR)",
            "Phenethylamine (PEA)",
            "Octopamine (OCT)",
            "Taurine (TAU)",
            "Carnosine (CAR)",
            "Anserine (ANS)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Reverse triiodothyronine (rT3)",
            "Diiodotyrosine (DIT)",
            "Monoiodotyrosine (MIT)",
            "Thyrotropin-releasing hormone (TRH)",
            "Melanocyte-stimulating hormone (MSH)",
            "Adrenocorticotropic hormone (ACTH)",
            "Parathyroid hormone (PTH)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Melatonin (MLT)",
                "Epinephrine (EPI)",
                "Norepinephrine (NE)",
                "Dopamine (DA)",
                "Serotonin (5-HT)",
                "Histamine (HA)",
                "Thyroid hormones (T3, T4)",
                "Catecholamines (EPI, NE, DA)",
                "Tryptamine (TRY)",
                "Tyramine (TYR)",
                "Phenethylamine (PEA)",
                "Octopamine (OCT)",
                "Taurine (TAU)",
                "Carnosine (CAR)",
                "Anserine (ANS)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Reverse triiodothyronine (rT3)",
                "Diiodotyrosine (DIT)",
                "Monoiodotyrosine (MIT)",
                "Thyrotropin-releasing hormone (TRH)",
                "Melanocyte-stimulating hormone (MSH)",
                "Adrenocorticotropic hormone (ACTH)",
                "Parathyroid hormone (PTH)"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenocorticotropic hormone (ACTH)",
                "Anserine (ANS)",
                "Carnosine (CAR)",
                "Catecholamines (EPI, NE, DA)",
                "Diiodotyrosine (DIT)",
                "Dopamine (DA)",
                "Epinephrine (EPI)",
                "Histamine (HA)",
                "Melanocyte-stimulating hormone (MSH)",
                "Melatonin (MLT)",
                "Monoiodotyrosine (MIT)",
                "Norepinephrine (NE)",
                "Octopamine (OCT)",
                "Parathyroid hormone (PTH)",
                "Phenethylamine (PEA)",
                "Reverse triiodothyronine (rT3)",
                "Serotonin (5-HT)",
                "Taurine (TAU)",
                "Thyroid hormones (T3, T4)",
                "Thyrotropin-releasing hormone (TRH)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Tryptamine (TRY)",
                "Tyramine (TYR)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "anthropic.claude-3-haiku-v1:0",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Adrenaline (E)",
            "Aldosterone (ALDO)",
            "Angiotensin II (ANG II)",
            "Antidiuretic Hormone (ADH)",
            "Calcitonin (CT)",
            "Cholecystokinin (CCK)",
            "Corticosterone (CORT)",
            "Cortisol (CORT)",
            "Dopamine (DA)",
            "Enkephalins (ENK)",
            "Epinephrine (E)",
            "Gastrin (G)",
            "Glucagon (GLC)",
            "Histamine (HIS)",
            "Insulin (INS)",
            "Melatonin (MEL)",
            "Norepinephrine (NE)",
            "Oxytocin (OXT)",
            "Parathyroid Hormone (PTH)",
            "Prolactin (PRL)",
            "Serotonin (5-HT)",
            "Substance P (SP)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Adrenaline (E)",
                "Aldosterone (ALDO)",
                "Angiotensin II (ANG II)",
                "Antidiuretic Hormone (ADH)",
                "Calcitonin (CT)",
                "Cholecystokinin (CCK)",
                "Corticosterone (CORT)",
                "Cortisol (CORT)",
                "Dopamine (DA)",
                "Enkephalins (ENK)",
                "Epinephrine (E)",
                "Gastrin (G)",
                "Glucagon (GLC)",
                "Histamine (HIS)",
                "Insulin (INS)",
                "Melatonin (MEL)",
                "Norepinephrine (NE)",
                "Oxytocin (OXT)",
                "Parathyroid Hormone (PTH)",
                "Prolactin (PRL)",
                "Serotonin (5-HT)",
                "Substance P (SP)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenaline (E)",
                "Aldosterone (ALDO)",
                "Angiotensin II (ANG II)",
                "Antidiuretic Hormone (ADH)",
                "Calcitonin (CT)",
                "Cholecystokinin (CCK)",
                "Corticosterone (CORT)",
                "Cortisol (CORT)",
                "Dopamine (DA)",
                "Enkephalins (ENK)",
                "Epinephrine (E)",
                "Gastrin (G)",
                "Glucagon (GLC)",
                "Histamine (HIS)",
                "Insulin (INS)",
                "Melatonin (MEL)",
                "Norepinephrine (NE)",
                "Oxytocin (OXT)",
                "Parathyroid Hormone (PTH)",
                "Prolactin (PRL)",
                "Serotonin (5-HT)",
                "Substance P (SP)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "meta.llama3-8b-instruct-v1:0",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Adrenaline (Epinephrine) (EPI)",
            "Adrenocorticotropic hormone (ACTH)",
            "Androstenedione",
            "Androstenediol",
            "Aldosterone",
            "Allopregnanolone",
            "Androstenol",
            "Androstenone",
            "Cortisol",
            "Cortisone",
            "Dehydroepiandrosterone (DHEA)",
            "Dehydroepiandrosterone sulfate (DHEAS)",
            "Estradiol",
            "Estrone",
            "Estriol",
            "Estrogen",
            "Growth hormone (GH)",
            "Insulin-like growth factor-1 (IGF-1)",
            "Melatonin",
            "Oxytocin",
            "Progesterone",
            "Prolactin",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Adrenaline (Epinephrine) (EPI)",
                "Adrenocorticotropic hormone (ACTH)",
                "Androstenedione",
                "Androstenediol",
                "Aldosterone",
                "Allopregnanolone",
                "Androstenol",
                "Androstenone",
                "Cortisol",
                "Cortisone",
                "Dehydroepiandrosterone (DHEA)",
                "Dehydroepiandrosterone sulfate (DHEAS)",
                "Estradiol",
                "Estrone",
                "Estriol",
                "Estrogen",
                "Growth hormone (GH)",
                "Insulin-like growth factor-1 (IGF-1)",
                "Melatonin",
                "Oxytocin",
                "Progesterone",
                "Prolactin",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenaline (Epinephrine) (EPI)",
                "Adrenocorticotropic hormone (ACTH)",
                "Aldosterone",
                "Allopregnanolone",
                "Androstenediol",
                "Androstenedione",
                "Androstenol",
                "Androstenone",
                "Cortisol",
                "Cortisone",
                "Dehydroepiandrosterone (DHEA)",
                "Dehydroepiandrosterone sulfate (DHEAS)",
                "Estradiol",
                "Estriol",
                "Estrogen",
                "Estrone",
                "Growth hormone (GH)",
                "Insulin-like growth factor-1 (IGF-1)",
                "Melatonin",
                "Oxytocin",
                "Progesterone",
                "Prolactin",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)"
            ],
            "TP": 24,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "meta.llama3-70b-instruct-v1:0",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": [
            "Adrenaline (ADL)",
            "Noradrenaline (NOR)",
            "Dopamine (DA)",
            "Thyroxine (T4)",
            "Triiodothyronine (T3)",
            "Melatonin",
            "Serotonin (5-HT)",
            "Histamine",
            "Thyroid-releasing hormone (TRH)",
            "Adrenocorticotropic hormone (ACTH)",
            "Alpha-melanocyte-stimulating hormone (\u03b1-MSH)",
            "Gamma-melanocyte-stimulating hormone (\u03b3-MSH)",
            "\u03b2-Endorphin",
            "Dynorphin",
            "Enkephalins",
            "Neuropeptide Y (NPY)",
            "Neurotensin",
            "Substance P",
            "Gastrin",
            "Cholecystokinin (CCK)",
            "Secretin",
            "Motilin",
            "Vasoactive intestinal peptide (VIP)"
        ],
        "scorer": "gpt-4o",
        "judgement": {
            "matches": [
                "Adrenaline (ADL)",
                "Noradrenaline (NOR)",
                "Dopamine (DA)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Melatonin",
                "Serotonin (5-HT)",
                "Histamine",
                "Thyroid-releasing hormone (TRH)",
                "Adrenocorticotropic hormone (ACTH)",
                "Alpha-melanocyte-stimulating hormone (\u03b1-MSH)",
                "Gamma-melanocyte-stimulating hormone (\u03b3-MSH)",
                "\u03b2-Endorphin",
                "Dynorphin",
                "Enkephalins",
                "Neuropeptide Y (NPY)",
                "Neurotensin",
                "Substance P",
                "Gastrin",
                "Cholecystokinin (CCK)",
                "Secretin",
                "Motilin",
                "Vasoactive intestinal peptide (VIP)"
            ],
            "mismatches": [],
            "true_referents": [
                "Adrenaline (ADL)",
                "Adrenocorticotropic hormone (ACTH)",
                "Alpha-melanocyte-stimulating hormone (\u03b1-MSH)",
                "Cholecystokinin (CCK)",
                "Dopamine (DA)",
                "Dynorphin",
                "Enkephalins",
                "Gamma-melanocyte-stimulating hormone (\u03b3-MSH)",
                "Gastrin",
                "Histamine",
                "Melatonin",
                "Motilin",
                "Neuropeptide Y (NPY)",
                "Neurotensin",
                "Noradrenaline (NOR)",
                "Secretin",
                "Serotonin (5-HT)",
                "Substance P",
                "Thyroid-releasing hormone (TRH)",
                "Thyroxine (T4)",
                "Triiodothyronine (T3)",
                "Vasoactive intestinal peptide (VIP)",
                "\u03b2-Endorphin"
            ],
            "TP": 23,
            "FP": 0,
            "FN": 0
        }
    },
    {
        "test": "limited-list-referents",
        "responder": "amazon.titan-text-express-v1",
        "concept": "amino acid-derived hormone",
        "domain": "biology",
        "response": "error when calling model 'amazon.titan-text-express-v1': empty response",
        "scorer": "gpt-4o",
        "judgement": {
            "TP": 0,
            "FP": 0
        }
    }
]