{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "049d492d-d867-4877-89dc-26b023789166",
    "model": "open_lm_1b",
    "creation_date": "2024_03_05-04_24_45",
    "eval_metrics": {
        "perplexity": 2.6035890420277914,
        "downstream_perpexity": {
            "mmlu": 1.5648463380640685,
            "hellaswag": 2.3285901234031696,
            "jeopardy_all": 1.7649594261066837,
            "triviaqa_sm_sub": 2.394010978251696,
            "gsm8k": 1.797184363561114,
            "agi_eval_sat_math": 1.5417313543233004,
            "aqua": 2.2422773526639355,
            "svamp": 2.5962975001335145,
            "bigbench_qa_wikidata": 3.5421279427181998,
            "arc_easy": 2.361810887150893,
            "arc_challenge": 2.478478506207466,
            "bigbench_misconceptions": 4.5270674163348055,
            "copa": 2.432545735836029,
            "siqa": 1.3475983329536727,
            "commonsense_qa": 1.6308880357054976,
            "piqa": 2.659090420756169,
            "openbook_qa": 4.078449437618255,
            "bigbench_novel_concepts": 2.3476900085806847,
            "bigbench_strange_stories": 3.1854450545091737,
            "bigbench_strategy_qa": 1.8508252864740153,
            "lambada_openai": 1.3520145168169746,
            "winograd_wsc": 2.4727259009749023,
            "winogrande": 3.0401146415659266,
            "bigbench_conlang_translation": 1.9787100851535797,
            "bigbench_language_identification": 3.9047495559063594,
            "bigbench_conceptual_combinations": 0.9171192402978545,
            "bigbench_elementary_math_qa": 3.8391068028641446,
            "bigbench_dyck_languages": 4.965156291484833,
            "agi_eval_lsat_ar": 1.6015775354012198,
            "bigbench_cs_algorithms": 5.195000061121854,
            "bigbench_logical_deduction": 0.9783340421915054,
            "bigbench_operators": 4.873017852646964,
            "bigbench_repeat_copy_logic": 1.41302927210927,
            "simple_arithmetic_nospaces": 6.588399987220765,
            "simple_arithmetic_withspaces": 6.2946311950683596,
            "math_qa": 3.5913428811071064,
            "logi_qa": 1.6720534497142387,
            "pubmed_qa_labeled": 7.2643349442481995,
            "squad": 2.2953605801797523,
            "agi_eval_lsat_rc": 1.719688900371096,
            "agi_eval_lsat_lr": 1.7651244743197572,
            "coqa": 3.2947230134170993,
            "bigbench_understanding_fables": 3.011508923989755,
            "boolq": 3.495164211518174,
            "agi_eval_sat_en": 1.7139148885763964,
            "winogender_mc_female": 1.1057605614264807,
            "winogender_mc_male": 0.9022531459728876,
            "enterprise_pii_classification": 4.686467461593021,
            "bbq": 0.24496919948207463,
            "human_eval_return_complex": 2.342168984450693,
            "human_eval_return_simple": 5.7398624291291105,
            "human_eval-0.5": 2.275969756812584,
            "human_eval-0.25": 2.3538552079258896,
            "human_eval-0.75": 2.3723589836097343,
            "human_eval": 2.443407028186612,
            "processed_human_eval_cpp": 2.554567267435678,
            "processed_human_eval_js": 2.2522222450593623
        },
        "icl": {
            "mmlu_zeroshot": 0.268989335810929,
            "hellaswag_zeroshot": 0.5890260934829712,
            "jeopardy": 0.26425160467624664,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.601446807384491,
            "arc_easy": 0.6670875549316406,
            "arc_challenge": 0.3771331012248993,
            "bigbench_misconceptions": 0.4794520437717438,
            "copa": 0.7400000095367432,
            "siqa": 0.5051177144050598,
            "commonsense_qa": 0.24078623950481415,
            "piqa": 0.7491838932037354,
            "openbook_qa": 0.3840000033378601,
            "bigbench_novel_concepts": 0.46875,
            "bigbench_strange_stories": 0.5517241358757019,
            "bigbench_strategy_qa": 0.5089558959007263,
            "lambada_openai": 0.5742285847663879,
            "hellaswag": 0.5933080911636353,
            "winograd": 0.7985348105430603,
            "winogrande": 0.5832675695419312,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.2612000107765198,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.24630503356456757,
            "bigbench_dyck_languages": 0.18799999356269836,
            "agi_eval_lsat_ar": 0.23043477535247803,
            "bigbench_cs_algorithms": 0.44621211290359497,
            "bigbench_logical_deduction": 0.2540000081062317,
            "bigbench_operators": 0.22857142984867096,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004000000189989805,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.2628226578235626,
            "logi_qa": 0.2857142984867096,
            "pubmed_qa_labeled": 0.38999998569488525,
            "squad": 0.36556291580200195,
            "agi_eval_lsat_rc": 0.29104477167129517,
            "agi_eval_lsat_lr": 0.25882354378700256,
            "coqa": 0.2967556118965149,
            "bigbench_understanding_fables": 0.22751322388648987,
            "boolq": 0.6256880760192871,
            "agi_eval_sat_en": 0.2330097109079361,
            "winogender_mc_female": 0.550000011920929,
            "winogender_mc_male": 0.5,
            "enterprise_pii_classification": 0.561708390712738,
            "bbq": 0.4814477942206643,
            "mmlu_fewshot": 0.24899953290035848,
            "gsm8k_cot": 0.006823351141065359,
            "agi_eval_sat_math_cot": 0.004545454401522875,
            "aqua_cot": 0.008163264952600002,
            "svamp_cot": 0.0533333346247673,
            "gpqa_main": 0.2388392835855484,
            "gpqa_diamond": 0.2373737394809723
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.24042029842070636,
        "language understanding": 0.3119018574117717,
        "reading comprehension": 0.1582950424860444,
        "safety": 0.04657809842716565,
        "symbolic problem solving": 0.08040856476618002,
        "world knowledge": 0.1542832563513119
    },
    "aggregated_centered_results": 0.16264549085401253,
    "aggregated_results": 0.34406255114202444,
    "rw_small": 0.597033734122912,
    "rw_small_centered": 0.3065276853522362,
    "95%_CI_above": 0.44720032254790326,
    "95%_CI_above_centered": 0.2685296038877407,
    "99%_CI_above": 0.45631250674309937,
    "99%_CI_above_centered": 0.31036897429601096,
    "low_variance_datasets": 0.44566724043000827,
    "low_variance_datasets_centered": 0.3096503807916198,
    "model_uuid": "4cb03f7f-0d14-4454-829b-139eb2859c45",
    "_filename": "exp_data/evals/evaluation_mix_rw_v2_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1_wiki_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.3096503807916198,
    "Extended": 0.16264549085401253
}