{
    "name": "dclm/eval/heavy",
    "uuid": "a0dc2c19-3495-4a9d-8504-28d5b73cba4d",
    "model": "LLM360/CrystalCoder",
    "creation_date": "2024_06_25-03_34_14",
    "eval_metrics": {
        "icl": {
            "mmlu_zeroshot": 0.374084400503259,
            "hellaswag_zeroshot": 0.717486560344696,
            "jeopardy": 0.47946962118148806,
            "triviaqa_sm_sub": 0.46000000834465027,
            "gsm8k_cot": 0.11372251808643341,
            "agi_eval_sat_math_cot": 0.059090908616781235,
            "aqua_cot": 0.04897959157824516,
            "svamp_cot": 0.46000000834465027,
            "bigbench_qa_wikidata": 0.7292948365211487,
            "arc_easy": 0.7167508602142334,
            "arc_challenge": 0.424914687871933,
            "mmlu_fewshot": 0.4817048415803073,
            "bigbench_misconceptions": 0.5205479264259338,
            "copa": 0.7400000095367432,
            "siqa": 0.7205731868743896,
            "commonsense_qa": 0.7141687273979187,
            "piqa": 0.780739963054657,
            "openbook_qa": 0.38600000739097595,
            "bigbench_novel_concepts": 0.59375,
            "bigbench_strange_stories": 0.6436781883239746,
            "bigbench_strategy_qa": 0.5923984050750732,
            "lambada_openai": 0.5656898617744446,
            "hellaswag": 0.7333200573921204,
            "winograd": 0.8498168587684631,
            "winogrande": 0.6795580387115479,
            "bigbench_conlang_translation": 0.18292683362960815,
            "bigbench_language_identification": 0.32589998841285706,
            "bigbench_conceptual_combinations": 0.553398072719574,
            "bigbench_elementary_math_qa": 0.30120545625686646,
            "bigbench_dyck_languages": 0.41600000858306885,
            "agi_eval_lsat_ar": 0.27391305565834045,
            "bigbench_cs_algorithms": 0.6265151500701904,
            "bigbench_logical_deduction": 0.2933333218097687,
            "bigbench_operators": 0.5190476179122925,
            "bigbench_repeat_copy_logic": 0.46875,
            "simple_arithmetic_nospaces": 0.23899999260902405,
            "simple_arithmetic_withspaces": 0.2370000034570694,
            "math_qa": 0.27053302526474,
            "logi_qa": 0.29032257199287415,
            "pubmed_qa_labeled": 0.5339999794960022,
            "squad": 0.6300851702690125,
            "agi_eval_lsat_rc": 0.38805970549583435,
            "agi_eval_lsat_lr": 0.31960785388946533,
            "coqa": 0.44156330823898315,
            "bigbench_understanding_fables": 0.45502644777297974,
            "boolq": 0.7345565557479858,
            "agi_eval_sat_en": 0.4563106894493103,
            "winogender_mc_female": 0.6000000238418579,
            "winogender_mc_male": 0.5833333134651184,
            "enterprise_pii_classification": 0.7617083787918091,
            "bbq": 0.5514446849172766,
            "gpqa_main": 0.2299107164144516,
            "gpqa_diamond": 0.1818181872367859
        }
    },
    "missing tasks": "[]",
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.4263667662631253,
        "language understanding": 0.4672569049762027,
        "reading comprehension": 0.3606951883258788,
        "safety": 0.248243200508031,
        "symbolic problem solving": 0.24264506751407772,
        "world knowledge": 0.29221052439937817
    },
    "aggregated_centered_results": 0.33187367802374956,
    "aggregated_results": 0.48020773938334366,
    "rw_small": 0.6688777854045233,
    "rw_small_centered": 0.4365750324656392,
    "95%_CI_above": 0.5894522517726019,
    "95%_CI_above_centered": 0.4495536445926848,
    "99%_CI_above": 0.6053630136925241,
    "99%_CI_above_centered": 0.494576455011657,
    "low_variance_datasets": 0.5887973156842319,
    "low_variance_datasets_centered": 0.48073897986479613,
    "_filename": "exp_data/evals/evaluation_llm360_crystalcoder_heavy.json",
    "Core": 0.48073897986479613,
    "Extended": 0.33187367802374956
}