{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "8fe8eaae-06e5-460b-8972-3425e0d60e10",
    "model": "open_lm_1b",
    "creation_date": "2024_03_22-11_38_54",
    "eval_metrics": {
        "icl": {
            "mmlu_zeroshot": 0.24222436207428313,
            "hellaswag_zeroshot": 0.5929097533226013,
            "jeopardy": 0.19677524715662004,
            "triviaqa_sm_sub": 0.19966666400432587,
            "gsm8k": 0.02047005295753479,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6102554202079773,
            "arc_easy": 0.625420868396759,
            "arc_challenge": 0.3302047848701477,
            "bigbench_misconceptions": 0.5068492889404297,
            "copa": 0.7300000190734863,
            "siqa": 0.48157626390457153,
            "commonsense_qa": 0.29893529415130615,
            "piqa": 0.7383025288581848,
            "openbook_qa": 0.3659999966621399,
            "bigbench_novel_concepts": 0.40625,
            "bigbench_strange_stories": 0.5114942789077759,
            "bigbench_strategy_qa": 0.5439056158065796,
            "lambada_openai": 0.5092179179191589,
            "hellaswag": 0.5927106142044067,
            "winograd": 0.7362637519836426,
            "winogrande": 0.5864246487617493,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.24950000643730164,
            "bigbench_conceptual_combinations": 0.25242719054222107,
            "bigbench_elementary_math_qa": 0.23977987468242645,
            "bigbench_dyck_languages": 0.17599999904632568,
            "agi_eval_lsat_ar": 0.22173912823200226,
            "bigbench_cs_algorithms": 0.4363636374473572,
            "bigbench_logical_deduction": 0.23866666853427887,
            "bigbench_operators": 0.190476194024086,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.0,
            "math_qa": 0.2500838041305542,
            "logi_qa": 0.2457757294178009,
            "pubmed_qa_labeled": 0.2770000100135803,
            "squad": 0.35818353295326233,
            "agi_eval_lsat_rc": 0.2537313401699066,
            "agi_eval_lsat_lr": 0.2450980395078659,
            "coqa": 0.2695728540420532,
            "bigbench_understanding_fables": 0.2063492089509964,
            "boolq": 0.6076452732086182,
            "agi_eval_sat_en": 0.22815534472465515,
            "winogender_mc_female": 0.5,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.5075110197067261,
            "bbq": 0.4691645286299966,
            "mmlu_fewshot": 0.25168947587933455,
            "gsm8k_cot": 0.018195603042840958,
            "agi_eval_sat_math_cot": 0.004545454401522875,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.05999999865889549,
            "gpqa_main": 0.2366071492433548,
            "gpqa_diamond": 0.21212121844291687
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.2257882730680695,
        "language understanding": 0.28384364379018673,
        "reading comprehension": 0.12182387335454685,
        "safety": -0.01999556611884723,
        "symbolic problem solving": 0.06962102875113488,
        "world knowledge": 0.15514197211255107
    },
    "aggregated_centered_results": 0.14278111716263342,
    "aggregated_results": 0.3306334867579762,
    "rw_small": 0.5787675579388937,
    "rw_small_centered": 0.2775084620330766,
    "95%_CI_above": 0.42412269852281387,
    "95%_CI_above_centered": 0.23819687401900896,
    "99%_CI_above": 0.4329527171409648,
    "99%_CI_above_centered": 0.27893510269161675,
    "low_variance_datasets": 0.42831370322541756,
    "low_variance_datasets_centered": 0.2862660343775258,
    "model_uuid": "4e94390b-6f50-476b-8683-650846c654e7",
    "_filename": "exp_data/evals/evaluation_refinedweb_v2_keyfix_ask_llm_gpt4++_1024_th0_2_masked-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=42-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.2862660343775258,
    "Extended": 0.14278111716263342
}