{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "454cc2ea-b254-494d-965e-548bc2053c92",
    "model": "open_lm_7b_swiglutorch",
    "creation_date": "2024_06_02-11_36_49",
    "eval_metrics": {
        "icl": {
            "mmlu_zeroshot": 0.24998237741620918,
            "hellaswag_zeroshot": 0.7253535389900208,
            "jeopardy": 0.4594766616821289,
            "triviaqa_sm_sub": 0.4283333420753479,
            "gsm8k_cot": 0.013646702282130718,
            "agi_eval_sat_math_cot": 0.022727273404598236,
            "aqua_cot": 0.020408162847161293,
            "bigbench_qa_wikidata": 0.6917474269866943,
            "arc_easy": 0.7196969985961914,
            "arc_challenge": 0.430887371301651,
            "mmlu_fewshot": 0.2422943329601957,
            "bigbench_misconceptions": 0.4429223835468292,
            "copa": 0.8100000023841858,
            "siqa": 0.5025588274002075,
            "commonsense_qa": 0.19901719689369202,
            "piqa": 0.790533185005188,
            "openbook_qa": 0.42800000309944153,
            "bigbench_novel_concepts": 0.5625,
            "bigbench_strange_stories": 0.6321839094161987,
            "bigbench_strategy_qa": 0.5744866728782654,
            "lambada_openai": 0.6937706470489502,
            "hellaswag": 0.7325233817100525,
            "winograd": 0.8424908518791199,
            "winogrande": 0.6598263382911682,
            "bigbench_conlang_translation": 0.03658536449074745,
            "bigbench_language_identification": 0.25029999017715454,
            "bigbench_conceptual_combinations": 0.3300970792770386,
            "bigbench_elementary_math_qa": 0.24696016311645508,
            "bigbench_dyck_languages": 0.17900000512599945,
            "agi_eval_lsat_ar": 0.2130434811115265,
            "bigbench_cs_algorithms": 0.4537878930568695,
            "bigbench_logical_deduction": 0.2526666522026062,
            "bigbench_operators": 0.190476194024086,
            "bigbench_repeat_copy_logic": 0.0625,
            "simple_arithmetic_nospaces": 0.014000000432133675,
            "simple_arithmetic_withspaces": 0.014000000432133675,
            "math_qa": 0.24539054930210114,
            "logi_qa": 0.23655913770198822,
            "pubmed_qa_labeled": 0.45100000500679016,
            "squad": 0.44351938366889954,
            "agi_eval_lsat_rc": 0.26865673065185547,
            "agi_eval_lsat_lr": 0.272549033164978,
            "coqa": 0.40698984265327454,
            "bigbench_understanding_fables": 0.22751322388648987,
            "boolq": 0.6767584085464478,
            "agi_eval_sat_en": 0.291262149810791,
            "winogender_mc_female": 0.4833333194255829,
            "winogender_mc_male": 0.4000000059604645,
            "enterprise_pii_classification": 0.5119293332099915,
            "bbq": 0.4554606052962216,
            "gpqa_main": 0.2455357164144516,
            "gpqa_diamond": 0.20202019810676575,
            "svamp_cot": 0.09000000357627869
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.31973106356040726,
        "language understanding": 0.41177540489116926,
        "reading comprehension": 0.21459854984035096,
        "safety": -0.07463836805386978,
        "symbolic problem solving": 0.08082085946260113,
        "world knowledge": 0.2252624857565116
    },
    "aggregated_centered_results": 0.20255140421932213,
    "aggregated_results": 0.377872869017467,
    "rw_small": 0.6704204430182775,
    "rw_small_centered": 0.4317112432585822,
    "95%_CI_above": 0.49755113019378155,
    "95%_CI_above_centered": 0.33507936480558653,
    "99%_CI_above": 0.5118403543596682,
    "99%_CI_above_centered": 0.38296591099621596,
    "low_variance_datasets": 0.5027135819196701,
    "low_variance_datasets_centered": 0.3867547634287481,
    "model_uuid": "4a76a3a8-706b-4893-b8d9-366688ff3127",
    "_filename": "exp_data/evals/evaluation_rw_v2_cc_v3_f0.15_resiliparse_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.2-open_lm_7b_swiglutorch-warm=5000-lr=0p0003-wd=0p33-cd=3e-05-bs=2048-mult=1-seed=124-tokens=137788211200_heavy.json",
    "missing tasks": "[]",
    "Core": 0.3867547634287481,
    "Extended": 0.20255140421932213
}