{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "44d741a3-c10b-4595-b53f-9939bdfde591",
    "model": "open_lm_1b_swiglutorch",
    "creation_date": "2024_06_03-20_56_46",
    "eval_metrics": {
        "icl": {
            "mmlu_zeroshot": 0.26106080569719015,
            "hellaswag_zeroshot": 0.541226863861084,
            "jeopardy": 0.20979372262954712,
            "triviaqa_sm_sub": 0.1666666716337204,
            "gsm8k_cot": 0.005307050887495279,
            "agi_eval_sat_math_cot": 0.004545454401522875,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.0533333346247673,
            "bigbench_qa_wikidata": 0.5797942876815796,
            "arc_easy": 0.628367006778717,
            "arc_challenge": 0.3233788311481476,
            "mmlu_fewshot": 0.2520745475041239,
            "bigbench_misconceptions": 0.4840182662010193,
            "copa": 0.6600000262260437,
            "siqa": 0.5143295526504517,
            "commonsense_qa": 0.19656018912792206,
            "piqa": 0.7268770337104797,
            "openbook_qa": 0.3799999952316284,
            "bigbench_novel_concepts": 0.46875,
            "bigbench_strange_stories": 0.540229856967926,
            "bigbench_strategy_qa": 0.5321100950241089,
            "lambada_openai": 0.5654958486557007,
            "hellaswag": 0.5525791645050049,
            "winograd": 0.7289377450942993,
            "winogrande": 0.5619573593139648,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.25429999828338623,
            "bigbench_conceptual_combinations": 0.3009708821773529,
            "bigbench_elementary_math_qa": 0.25162473320961,
            "bigbench_dyck_languages": 0.2840000092983246,
            "agi_eval_lsat_ar": 0.22173912823200226,
            "bigbench_cs_algorithms": 0.4151515066623688,
            "bigbench_logical_deduction": 0.25466665625572205,
            "bigbench_operators": 0.16190476715564728,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.0020000000949949026,
            "math_qa": 0.2541066110134125,
            "logi_qa": 0.2457757294178009,
            "pubmed_qa_labeled": 0.4009999930858612,
            "squad": 0.30804163217544556,
            "agi_eval_lsat_rc": 0.25746268033981323,
            "agi_eval_lsat_lr": 0.2862745225429535,
            "coqa": 0.24902918934822083,
            "bigbench_understanding_fables": 0.2698412835597992,
            "boolq": 0.574923574924469,
            "agi_eval_sat_en": 0.23786407709121704,
            "winogender_mc_female": 0.5666666626930237,
            "winogender_mc_male": 0.4333333373069763,
            "enterprise_pii_classification": 0.5331369638442993,
            "bbq": 0.48687340725551953,
            "gpqa_main": 0.2299107164144516,
            "gpqa_diamond": 0.2222222238779068
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.21190793587728918,
        "language understanding": 0.2756161562521093,
        "reading comprehension": 0.1362800651082867,
        "safety": 0.010005185549909423,
        "symbolic problem solving": 0.07518741561410328,
        "world knowledge": 0.1480310056240935
    },
    "aggregated_centered_results": 0.14412406208591705,
    "aggregated_results": 0.33331298699178824,
    "rw_small": 0.5483089437087377,
    "rw_small_centered": 0.21932047814653632,
    "95%_CI_above": 0.42279910544876925,
    "95%_CI_above_centered": 0.23129876496583926,
    "99%_CI_above": 0.42625629733438075,
    "99%_CI_above_centered": 0.26381233350226757,
    "low_variance_datasets": 0.4147299036383629,
    "low_variance_datasets_centered": 0.26272989603116864,
    "model_uuid": "b15142fe-d83d-4412-83c2-593033869819",
    "_filename": "exp_data/evals/evaluation_rw_v2_cc_v3_f0.15_resiliparse_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_1b_swiglutorch-warm=5000-lr=0p0003-wd=0p33-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.26272989603116864,
    "Extended": 0.14412406208591705
}