{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "befafde1-ab4c-4b20-b9f7-b64c06f86e78",
    "model": "open_lm_1b",
    "creation_date": "2024_01_29-13_45_40",
    "eval_metrics": {
        "perplexity": 2.2384712636470794,
        "downstream_perpexity": {
            "mmlu": 2.7440373167711827,
            "hellaswag": 2.3660058031707045,
            "jeopardy_all": 2.390094904838782,
            "triviaqa_sm_sub": 2.6577736510833105,
            "gsm8k": 1.9873664130578899,
            "agi_eval_sat_math": 1.678891425782984,
            "aqua": 2.591660991006968,
            "svamp": 2.841634438832601,
            "bigbench_qa_wikidata": 3.1156243966218375,
            "arc_easy": 2.596893537726868,
            "arc_challenge": 2.68506431940676,
            "bigbench_misconceptions": 5.654808022660207,
            "copa": 2.607797778844833,
            "siqa": 2.3172131580364494,
            "commonsense_qa": 3.4469294481644566,
            "piqa": 2.5763731686412576,
            "openbook_qa": 4.295614561080932,
            "bigbench_novel_concepts": 2.739128038287163,
            "bigbench_strange_stories": 3.4336154351289245,
            "bigbench_strategy_qa": 2.035351710425761,
            "lambada_openai": 1.6387746365379066,
            "winograd_wsc": 2.582246822096926,
            "winogrande": 3.1149315143440686,
            "bigbench_conlang_translation": 1.9945612171801126,
            "bigbench_language_identification": 3.2888981700778173,
            "bigbench_conceptual_combinations": 0.6184260764168305,
            "bigbench_elementary_math_qa": 2.420432589260912,
            "bigbench_dyck_languages": 2.532236327767372,
            "agi_eval_lsat_ar": 1.8821743887403737,
            "bigbench_cs_algorithms": 5.66489996693351,
            "bigbench_logical_deduction": 0.7341694225470226,
            "bigbench_operators": 5.205815629732041,
            "bigbench_repeat_copy_logic": 1.5339310579001904,
            "simple_arithmetic_nospaces": 6.85688458442688,
            "simple_arithmetic_withspaces": 5.722272901058197,
            "math_qa": 4.041607016330992,
            "logi_qa": 1.987965187535674,
            "pubmed_qa_labeled": 6.749374970436096,
            "squad": 2.100608573821545,
            "agi_eval_lsat_rc": 2.1653498095362935,
            "agi_eval_lsat_lr": 2.154129633249021,
            "coqa": 1.727476010716202,
            "bigbench_understanding_fables": 3.5840040968839455,
            "boolq": 2.8920594007597056,
            "agi_eval_sat_en": 2.1486576031712654,
            "winogender_mc_female": 1.2977520724137623,
            "winogender_mc_male": 1.1011890073617299,
            "enterprise_pii_classification": 3.8527190167703753,
            "bbq": 0.3249649057470622,
            "human_eval_return_complex": 0.8971446771321334,
            "human_eval_return_simple": 3.8880007299217016,
            "human_eval-0.5": 0.7856876005123301,
            "human_eval-0.25": 0.8276843741172697,
            "human_eval-0.75": 0.8059736890036885,
            "human_eval": 0.8944523985066065,
            "processed_human_eval_cpp": 0.8012924312804797,
            "processed_human_eval_js": 0.7941058716032563
        },
        "icl": {
            "mmlu_zeroshot": 0.2311794833655943,
            "hellaswag_zeroshot": 0.5407289266586304,
            "jeopardy": 0.046110181324183944,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.00909090880304575,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6007578372955322,
            "arc_easy": 0.5888047218322754,
            "arc_challenge": 0.2977815568447113,
            "bigbench_misconceptions": 0.45205479860305786,
            "copa": 0.699999988079071,
            "siqa": 0.4948822855949402,
            "commonsense_qa": 0.2522522509098053,
            "piqa": 0.7366703152656555,
            "openbook_qa": 0.3619999885559082,
            "bigbench_novel_concepts": 0.46875,
            "bigbench_strange_stories": 0.48275861144065857,
            "bigbench_strategy_qa": 0.520751416683197,
            "lambada_openai": 0.4944692552089691,
            "hellaswag": 0.5482971668243408,
            "winograd": 0.7032967209815979,
            "winogrande": 0.5706393122673035,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.25519999861717224,
            "bigbench_conceptual_combinations": 0.25242719054222107,
            "bigbench_elementary_math_qa": 0.24310795962810516,
            "bigbench_dyck_languages": 0.3630000054836273,
            "agi_eval_lsat_ar": 0.1608695685863495,
            "bigbench_cs_algorithms": 0.42500001192092896,
            "bigbench_logical_deduction": 0.2446666657924652,
            "bigbench_operators": 0.18095238506793976,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0010000000474974513,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.24773718416690826,
            "logi_qa": 0.2611367106437683,
            "pubmed_qa_labeled": 0.5360000133514404,
            "squad": 0.3235572278499603,
            "agi_eval_lsat_rc": 0.26865673065185547,
            "agi_eval_lsat_lr": 0.250980406999588,
            "coqa": 0.26105475425720215,
            "bigbench_understanding_fables": 0.26455026865005493,
            "boolq": 0.582874596118927,
            "agi_eval_sat_en": 0.2864077687263489,
            "winogender_mc_female": 0.4000000059604645,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.5723122358322144,
            "bbq": 0.47985441847281024,
            "mmlu_fewshot": 0.242903827836639,
            "gsm8k_cot": 0.011372251436114311,
            "agi_eval_sat_math_cot": 0.013636363670229912,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.05666666850447655,
            "gpqa_main": 0.2299107164144516,
            "gpqa_diamond": 0.22727273404598236
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.20920610900033731,
        "language understanding": 0.25373455561719,
        "reading comprehension": 0.16271143921307826,
        "safety": 0.0010833360932090141,
        "symbolic problem solving": 0.07780296993877772,
        "world knowledge": 0.09751150029418004
    },
    "aggregated_centered_results": 0.13493791937813485,
    "aggregated_results": 0.3266226631511249,
    "rw_small": 0.5468048503001531,
    "95%_CI_above": 0.42251245560209183,
    "99%_CI_above": 0.4281490878485467,
    "model_uuid": "310cd6c7-6409-46bf-9cc1-9aea7f41c476",
    "low_variance_datasets": 0.40883258045227694,
    "_filename": "exp_data/evals/evaluation_dolma_v1_no_resample-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124_heavy.json",
    "missing tasks": "[]",
    "rw_small_centered": 0.2244445183123761,
    "95%_CI_above_centered": 0.23192272878939285,
    "99%_CI_above_centered": 0.26806923800015225,
    "low_variance_datasets_centered": 0.25752754778512504,
    "Core": 0.25752754778512504,
    "Extended": 0.13493791937813485
}