{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "2e6b4e8f-316d-499f-9284-7f241cffe6a6",
    "model": "open_lm_1b",
    "creation_date": "2024_03_15-01_05_00",
    "eval_metrics": {
        "perplexity": 2.5618476033210755,
        "downstream_perpexity": {
            "mmlu": 1.6854342669779174,
            "hellaswag": 2.3636538272976186,
            "jeopardy_all": 1.3970444670503974,
            "triviaqa_sm_sub": 2.26259212906907,
            "gsm8k": 1.855400841465676,
            "agi_eval_sat_math": 1.5407524374398318,
            "aqua": 2.2372914606211136,
            "svamp": 2.7563262097040813,
            "bigbench_qa_wikidata": 3.214393653831307,
            "arc_easy": 2.2719950499358,
            "arc_challenge": 2.4305501150494955,
            "bigbench_misconceptions": 4.261877773014922,
            "copa": 2.496316976547241,
            "siqa": 1.2985553511455579,
            "commonsense_qa": 1.6491002672329778,
            "piqa": 2.6906359632853194,
            "openbook_qa": 4.13589733839035,
            "bigbench_novel_concepts": 2.3481713086366653,
            "bigbench_strange_stories": 3.163659012180635,
            "bigbench_strategy_qa": 1.820974796346412,
            "lambada_openai": 1.3414093119253319,
            "winograd_wsc": 2.4977947041228576,
            "winogrande": 3.0819723237285297,
            "bigbench_conlang_translation": 1.8642057623805068,
            "bigbench_language_identification": 3.5363955615544036,
            "bigbench_conceptual_combinations": 1.0452202246027085,
            "bigbench_elementary_math_qa": 4.442276725376052,
            "bigbench_dyck_languages": 4.550794855117798,
            "agi_eval_lsat_ar": 1.8576780915260316,
            "bigbench_cs_algorithms": 4.1446283571647875,
            "bigbench_logical_deduction": 1.0439437974294026,
            "bigbench_operators": 4.709181500616528,
            "bigbench_repeat_copy_logic": 1.4175149612128735,
            "simple_arithmetic_nospaces": 6.4665893015861515,
            "simple_arithmetic_withspaces": 6.44173193693161,
            "math_qa": 3.1574568287302416,
            "logi_qa": 2.0260927933518604,
            "pubmed_qa_labeled": 4.993588093757629,
            "squad": 2.2397506509564797,
            "agi_eval_lsat_rc": 1.848074862316473,
            "agi_eval_lsat_lr": 1.8854955528296677,
            "coqa": 2.848817262660885,
            "bigbench_understanding_fables": 3.9756218617555326,
            "boolq": 3.2003040500372557,
            "agi_eval_sat_en": 1.6910572908457042,
            "winogender_mc_female": 1.1492717434962592,
            "winogender_mc_male": 0.9280543218056361,
            "enterprise_pii_classification": 4.976567955326086,
            "bbq": 0.2777093152919951,
            "human_eval_return_complex": 2.3303949635798538,
            "human_eval_return_simple": 5.169031632913126,
            "human_eval-0.5": 2.2799775491400465,
            "human_eval-0.25": 2.3629336015480322,
            "human_eval-0.75": 2.3710071931524976,
            "human_eval": 2.487842939248899,
            "processed_human_eval_cpp": 2.5843854572461997,
            "processed_human_eval_js": 2.3058599662490007
        },
        "icl": {
            "mmlu_zeroshot": 0.2508939977800637,
            "hellaswag_zeroshot": 0.5633339881896973,
            "jeopardy": 0.30864564180374143,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6125190854072571,
            "arc_easy": 0.6649831533432007,
            "arc_challenge": 0.35238906741142273,
            "bigbench_misconceptions": 0.4931506812572479,
            "copa": 0.7300000190734863,
            "siqa": 0.48464688658714294,
            "commonsense_qa": 0.2940213084220886,
            "piqa": 0.7317736744880676,
            "openbook_qa": 0.36800000071525574,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.5287356376647949,
            "bigbench_strategy_qa": 0.5190039277076721,
            "lambada_openai": 0.5984863042831421,
            "hellaswag": 0.5720971822738647,
            "winograd": 0.7692307829856873,
            "winogrande": 0.5777426958084106,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.2590999901294708,
            "bigbench_conceptual_combinations": 0.25242719054222107,
            "bigbench_elementary_math_qa": 0.24030397832393646,
            "bigbench_dyck_languages": 0.1979999989271164,
            "agi_eval_lsat_ar": 0.23043477535247803,
            "bigbench_cs_algorithms": 0.4113636314868927,
            "bigbench_logical_deduction": 0.2513333261013031,
            "bigbench_operators": 0.20476190745830536,
            "bigbench_repeat_copy_logic": 0.0625,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.2296345978975296,
            "logi_qa": 0.25960060954093933,
            "pubmed_qa_labeled": 0.38999998569488525,
            "squad": 0.3898770213127136,
            "agi_eval_lsat_rc": 0.2723880708217621,
            "agi_eval_lsat_lr": 0.25882354378700256,
            "coqa": 0.30351999402046204,
            "bigbench_understanding_fables": 0.22751322388648987,
            "boolq": 0.6064220070838928,
            "agi_eval_sat_en": 0.2669903039932251,
            "winogender_mc_female": 0.4166666567325592,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.5189985036849976,
            "bbq": 0.45123048804023047,
            "mmlu_fewshot": 0.24881509678405628,
            "gsm8k_cot": 0.006823351141065359,
            "agi_eval_sat_math_cot": 0.0181818176060915,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.06333333253860474,
            "gpqa_main": 0.2008928507566452,
            "gpqa_diamond": 0.22727273404598236
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.23703039240469967,
        "language understanding": 0.29326653009771325,
        "reading comprehension": 0.15859065343740217,
        "safety": -0.031552169810641906,
        "symbolic problem solving": 0.07669555189504743,
        "world knowledge": 0.1501461956550419
    },
    "aggregated_centered_results": 0.15163749364626566,
    "aggregated_results": 0.3389730317531627,
    "rw_small": 0.5824784884850184,
    "rw_small_centered": 0.28122702467511274,
    "95%_CI_above": 0.4443343908949332,
    "95%_CI_above_centered": 0.25906669052370074,
    "99%_CI_above": 0.4547435099663942,
    "99%_CI_above_centered": 0.30214817345506084,
    "low_variance_datasets": 0.4458728286353024,
    "low_variance_datasets_centered": 0.30483361265418796,
    "model_uuid": "db2839d0-499f-4c60-90a5-bcd0cebb896f",
    "_filename": "exp_data/evals/evaluation_RW_v2_fasttext_length_OH_vs_unlabeled-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.30483361265418796,
    "Extended": 0.15163749364626566
}