{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "a780cf0b-c5ba-46dd-8703-48c123aa5684",
    "model": "open_lm_1b",
    "creation_date": "2024_03_05-02_58_34",
    "eval_metrics": {
        "perplexity": 2.605432198445002,
        "downstream_perpexity": {
            "mmlu": 1.6755261616376778,
            "hellaswag": 2.3097753316205565,
            "jeopardy_all": 1.7160292505183252,
            "triviaqa_sm_sub": 2.4002023626764615,
            "gsm8k": 1.8227565538892248,
            "agi_eval_sat_math": 1.543319081176411,
            "aqua": 2.230330982500193,
            "svamp": 2.723137406508128,
            "bigbench_qa_wikidata": 3.7089554719389684,
            "arc_easy": 2.2848324535370677,
            "arc_challenge": 2.4206284043274233,
            "bigbench_misconceptions": 4.433859031494349,
            "copa": 2.4284385442733765,
            "siqa": 1.6105699461704632,
            "commonsense_qa": 1.776991893188764,
            "piqa": 2.6442791441449405,
            "openbook_qa": 4.073792212486267,
            "bigbench_novel_concepts": 2.348542384803295,
            "bigbench_strange_stories": 3.752217510650898,
            "bigbench_strategy_qa": 1.8601578635490321,
            "lambada_openai": 1.386493268570442,
            "winograd_wsc": 2.4497444957167236,
            "winogrande": 3.03507230821894,
            "bigbench_conlang_translation": 1.9211284009421743,
            "bigbench_language_identification": 3.1042160247177764,
            "bigbench_conceptual_combinations": 0.9863664415276167,
            "bigbench_elementary_math_qa": 4.340047489590865,
            "bigbench_dyck_languages": 4.551590073347092,
            "agi_eval_lsat_ar": 1.6084557077159052,
            "bigbench_cs_algorithms": 4.607884274107037,
            "bigbench_logical_deduction": 1.0794487104415893,
            "bigbench_operators": 4.845767226105645,
            "bigbench_repeat_copy_logic": 1.3140942864120007,
            "simple_arithmetic_nospaces": 6.839018192768097,
            "simple_arithmetic_withspaces": 6.114085217475891,
            "math_qa": 3.7357610978418414,
            "logi_qa": 1.700169177099307,
            "pubmed_qa_labeled": 6.153861145496369,
            "squad": 2.3977528141823825,
            "agi_eval_lsat_rc": 1.7143935952613603,
            "agi_eval_lsat_lr": 1.8015430062424902,
            "coqa": 4.100707824735225,
            "bigbench_understanding_fables": 3.1973232844519237,
            "boolq": 3.202441118543666,
            "agi_eval_sat_en": 1.68611485113218,
            "winogender_mc_female": 1.1065640141566595,
            "winogender_mc_male": 0.9611418887972831,
            "enterprise_pii_classification": 4.397881984991599,
            "bbq": 0.27167081553196,
            "human_eval_return_complex": 2.32931486077196,
            "human_eval_return_simple": 5.1351813625644995,
            "human_eval-0.5": 2.2665285686167276,
            "human_eval-0.25": 2.3539176241653723,
            "human_eval-0.75": 2.3536366871217402,
            "human_eval": 2.4651356324916933,
            "processed_human_eval_cpp": 2.608204031582945,
            "processed_human_eval_js": 2.4751753429087198
        },
        "icl": {
            "mmlu_zeroshot": 0.26127135204641444,
            "hellaswag_zeroshot": 0.5928102135658264,
            "jeopardy": 0.26876194179058077,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0015163002535700798,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.5894395112991333,
            "arc_easy": 0.679713785648346,
            "arc_challenge": 0.3839590549468994,
            "bigbench_misconceptions": 0.47488585114479065,
            "copa": 0.7599999904632568,
            "siqa": 0.48874104022979736,
            "commonsense_qa": 0.3505323529243469,
            "piqa": 0.7486398220062256,
            "openbook_qa": 0.38999998569488525,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.5632184147834778,
            "bigbench_strategy_qa": 0.49759721755981445,
            "lambada_openai": 0.5742285847663879,
            "hellaswag": 0.5988847017288208,
            "winograd": 0.761904776096344,
            "winogrande": 0.5982636213302612,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.249099999666214,
            "bigbench_conceptual_combinations": 0.24271844327449799,
            "bigbench_elementary_math_qa": 0.24004192650318146,
            "bigbench_dyck_languages": 0.20600000023841858,
            "agi_eval_lsat_ar": 0.269565224647522,
            "bigbench_cs_algorithms": 0.4515151381492615,
            "bigbench_logical_deduction": 0.2606666684150696,
            "bigbench_operators": 0.2142857164144516,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.008999999612569809,
            "simple_arithmetic_withspaces": 0.00800000037997961,
            "math_qa": 0.24371437728405,
            "logi_qa": 0.2780337929725647,
            "pubmed_qa_labeled": 0.39399999380111694,
            "squad": 0.3580889403820038,
            "agi_eval_lsat_rc": 0.2014925330877304,
            "agi_eval_lsat_lr": 0.22549019753932953,
            "coqa": 0.3090316951274872,
            "bigbench_understanding_fables": 0.24867725372314453,
            "boolq": 0.5886850357055664,
            "agi_eval_sat_en": 0.20873786509037018,
            "winogender_mc_female": 0.46666666865348816,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.4933726191520691,
            "bbq": 0.48014930161562835,
            "mmlu_fewshot": 0.2596522841537208,
            "gsm8k_cot": 0.006823351141065359,
            "agi_eval_sat_math_cot": 0.004545454401522875,
            "aqua_cot": 0.02448979578912258,
            "svamp_cot": 0.05666666850447655,
            "gpqa_main": 0.2254464328289032,
            "gpqa_diamond": 0.2222222238779068
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.26527972391404875,
        "language understanding": 0.2991548797809309,
        "reading comprehension": 0.127283953636754,
        "safety": -0.00490569932894272,
        "symbolic problem solving": 0.08398777110306503,
        "world knowledge": 0.1517993333378646
    },
    "aggregated_centered_results": 0.1564516143634475,
    "aggregated_results": 0.3413634016397293,
    "rw_small": 0.6005719502766927,
    "rw_small_centered": 0.3071272491711622,
    "95%_CI_above": 0.4492921950464899,
    "95%_CI_above_centered": 0.266953827331116,
    "99%_CI_above": 0.4600731611251831,
    "99%_CI_above_centered": 0.3105616379725306,
    "low_variance_datasets": 0.45197318602692,
    "low_variance_datasets_centered": 0.3152772651652282,
    "model_uuid": "bb8fa899-e173-446a-a1b4-64650c897d2a",
    "_filename": "exp_data/evals/evaluation_mix_rw_v2_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1_arxiv_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.3152772651652282,
    "Extended": 0.1564516143634475
}