{
    "name": "eval/heavy",
    "uuid": "b8eb61fb-1d7d-4028-9680-5eaae48aee20",
    "model": "meta-llama/Llama-2-7b-hf",
    "creation_date": "2024_03_27-02_29_19",
    "eval_metrics": {
        "icl": {
            "mmlu_zeroshot": 0.382670291944554,
            "hellaswag_zeroshot": 0.7498506307601929,
            "jeopardy": 0.530568516254425,
            "triviaqa_sm_sub": 0.5879999995231628,
            "gsm8k": 0.048521608114242554,
            "agi_eval_sat_math": 0.0181818176060915,
            "aqua": 0.0367346927523613,
            "bigbench_qa_wikidata": 0.7346587181091309,
            "arc_easy": 0.7693602442741394,
            "arc_challenge": 0.5093856453895569,
            "bigbench_misconceptions": 0.5707762837409973,
            "copa": 0.800000011920929,
            "siqa": 0.7343909740447998,
            "commonsense_qa": 0.6265356540679932,
            "piqa": 0.7948857545852661,
            "openbook_qa": 0.4259999990463257,
            "bigbench_novel_concepts": 0.5625,
            "bigbench_strange_stories": 0.7298850417137146,
            "bigbench_strategy_qa": 0.6037570834159851,
            "lambada_openai": 0.7362701296806335,
            "hellaswag": 0.7733519077301025,
            "winograd": 0.8534798622131348,
            "winogrande": 0.6937648057937622,
            "bigbench_conlang_translation": 0.11585365980863571,
            "bigbench_language_identification": 0.3271999955177307,
            "bigbench_conceptual_combinations": 0.5922330021858215,
            "bigbench_elementary_math_qa": 0.2790094316005707,
            "bigbench_dyck_languages": 0.34299999475479126,
            "agi_eval_lsat_ar": 0.2869565188884735,
            "bigbench_cs_algorithms": 0.4469696879386902,
            "bigbench_logical_deduction": 0.30799999833106995,
            "bigbench_operators": 0.4571428596973419,
            "bigbench_repeat_copy_logic": 0.21875,
            "simple_arithmetic_nospaces": 0.25999999046325684,
            "simple_arithmetic_withspaces": 0.2619999945163727,
            "math_qa": 0.27187395095825195,
            "logi_qa": 0.30568355321884155,
            "pubmed_qa_labeled": 0.6320000290870667,
            "squad": 0.6075685620307922,
            "agi_eval_lsat_rc": 0.3805970251560211,
            "agi_eval_lsat_lr": 0.35686275362968445,
            "coqa": 0.47037455439567566,
            "bigbench_understanding_fables": 0.29100528359413147,
            "boolq": 0.7972477078437805,
            "agi_eval_sat_en": 0.46116504073143005,
            "winogender_mc_female": 0.5333333611488342,
            "winogender_mc_male": 0.699999988079071,
            "enterprise_pii_classification": 0.6798232793807983,
            "bbq": 0.5796832929957997,
            "mmlu_fewshot": 0.458114252801527,
            "gsm8k_cot": 0.1205458715558052,
            "agi_eval_sat_math_cot": 0.05000000074505806,
            "aqua_cot": 0.040816325694322586,
            "svamp_cot": 0.28333333134651184,
            "gpqa_main": 0.234375,
            "gpqa_diamond": 0.23232322931289673
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.4599753230668675,
        "language understanding": 0.5033799782590029,
        "reading comprehension": 0.37418989969515487,
        "safety": 0.24641996080225165,
        "symbolic problem solving": 0.19758263407912974,
        "world knowledge": 0.3443084686332279
    },
    "aggregated_centered_results": 0.3413748235640836,
    "aggregated_results": 0.4821496807852426,
    "rw_small": 0.7199348409970602,
    "rw_small_centered": 0.5264610554042616,
    "95%_CI_above": 0.5922072144878375,
    "95%_CI_above_centered": 0.4604278430042799,
    "99%_CI_above": 0.6059976119062175,
    "99%_CI_above_centered": 0.5029035479114494,
    "low_variance_datasets": 0.5887873527678577,
    "low_variance_datasets_centered": 0.49249913608100654,
    "_filename": "exp_data/evals/evaluation_llama2_7b_openlm_heavy.json",
    "missing tasks": "[]",
    "Core": 0.49249913608100654,
    "Extended": 0.3413748235640836
}