{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "5c71ed01-3fc3-44df-adb9-dfc14d8a193c",
    "model": "d=1024_l=24_h=8",
    "creation_date": "2024_01_29-21_58_32",
    "eval_metrics": {
        "perplexity": 4.257029650608699,
        "downstream_perpexity": {
            "mmlu": 5.2633670727411905,
            "hellaswag": 3.2036101003487905,
            "jeopardy_all": 4.891584998369217,
            "triviaqa_sm_sub": 5.098835011323293,
            "gsm8k": 3.6604472001393638,
            "agi_eval_sat_math": 4.0540619828484274,
            "aqua": 3.916470278525839,
            "svamp": 4.667147844632467,
            "bigbench_qa_wikidata": 6.386074042320251,
            "arc_easy": 4.237316222985585,
            "arc_challenge": 4.107224599520365,
            "bigbench_misconceptions": 6.538437529785992,
            "copa": 3.583229353427887,
            "siqa": 5.023778613408407,
            "commonsense_qa": 5.02401340007782,
            "piqa": 3.5091388881206513,
            "openbook_qa": 5.456998148918152,
            "bigbench_novel_concepts": 4.184283271431923,
            "bigbench_strange_stories": 4.52154113780493,
            "bigbench_strategy_qa": 2.1263336996237436,
            "lambada_openai": 3.699370966355006,
            "winograd_wsc": 3.342299398048457,
            "winogrande": 3.699420328934987,
            "bigbench_conlang_translation": 3.1615454481869207,
            "bigbench_language_identification": 4.691337730487188,
            "bigbench_conceptual_combinations": 1.2010229316729943,
            "bigbench_elementary_math_qa": 5.0501674552758535,
            "bigbench_dyck_languages": 6.981960768459224,
            "agi_eval_lsat_ar": 5.171949164763741,
            "bigbench_cs_algorithms": 6.717364696661631,
            "bigbench_logical_deduction": 0.9296851525704066,
            "bigbench_operators": 6.532752127874465,
            "bigbench_repeat_copy_logic": 2.747652731835842,
            "simple_arithmetic_nospaces": 7.432650245538278,
            "simple_arithmetic_withspaces": 7.239711116341984,
            "math_qa": 4.835359553496043,
            "logi_qa": 5.4132746677794215,
            "pubmed_qa_labeled": 7.369622148886449,
            "squad": 3.6042007098595303,
            "agi_eval_lsat_rc": 5.944860561570125,
            "agi_eval_lsat_lr": 6.025580163095512,
            "coqa": 4.019305074214936,
            "bigbench_understanding_fables": 5.598919109061912,
            "boolq": 4.464268459876378,
            "agi_eval_sat_en": 5.876796002526885,
            "winogender_mc_female": 4.626427416006724,
            "winogender_mc_male": 4.063952728112539,
            "enterprise_pii_classification": 3.6443660219510394,
            "bbq": 0.8390809128681819,
            "human_eval_return_complex": 4.948613204355315,
            "human_eval_return_simple": 7.956711176279429,
            "human_eval-0.5": 4.881902344343139,
            "human_eval-0.25": 4.975432542766013,
            "human_eval-0.75": 5.048929963170028,
            "human_eval": 5.06963963043399,
            "processed_human_eval_cpp": 5.155454668199053,
            "processed_human_eval_js": 5.1856498863638905
        },
        "icl": {
            "mmlu_zeroshot": 0.23110839776825487,
            "mmlu": 0.23942957870792925,
            "hellaswag_zeroshot": 0.28251343965530396,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.006666666828095913,
            "bigbench_qa_wikidata": 0.14320161938667297,
            "arc_easy": 0.341329962015152,
            "arc_challenge": 0.22440272569656372,
            "bigbench_misconceptions": 0.49771690368652344,
            "copa": 0.550000011920929,
            "siqa": 0.4984646737575531,
            "commonsense_qa": 0.2694512605667114,
            "piqa": 0.6050054430961609,
            "openbook_qa": 0.27399998903274536,
            "bigbench_novel_concepts": 0.3125,
            "bigbench_strange_stories": 0.4482758641242981,
            "bigbench_strategy_qa": 0.496723473072052,
            "lambada_openai": 0.16902774572372437,
            "hellaswag": 0.28131845593452454,
            "winograd": 0.5384615659713745,
            "winogrande": 0.5209155678749084,
            "bigbench_conlang_translation": 0.006097560748457909,
            "bigbench_language_identification": 0.2556999921798706,
            "bigbench_conceptual_combinations": 0.291262149810791,
            "bigbench_elementary_math_qa": 0.2740303874015808,
            "bigbench_dyck_languages": 0.054999999701976776,
            "agi_eval_lsat_ar": 0.20000000298023224,
            "bigbench_cs_algorithms": 0.3492424190044403,
            "bigbench_logical_deduction": 0.23933333158493042,
            "bigbench_operators": 0.0476190485060215,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.0,
            "math_qa": 0.2504190504550934,
            "logi_qa": 0.22580644488334656,
            "pubmed_qa_labeled": 0.1340000033378601,
            "squad": 0.0014191106893122196,
            "agi_eval_lsat_rc": 0.302238792181015,
            "agi_eval_lsat_lr": 0.2647058963775635,
            "coqa": 0.04208943992853165,
            "bigbench_understanding_fables": 0.26455026865005493,
            "boolq": 0.49143731594085693,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4833333194255829,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.4512518346309662,
            "bbq": 0.44257809628139844
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.09852590660254161,
        "language understanding": 0.05520061970067521,
        "reading comprehension": 0.042125538127341614,
        "safety": -0.0697517151182348,
        "symbolic problem solving": 0.04341283626337018,
        "world knowledge": 0.02341620570916869
    },
    "aggregated_centered_results": 0.04169278126657707,
    "aggregated_results": 0.2508590352735015,
    "rw_small": 0.401766503850619,
    "95%_CI_above": 0.2582407930823551,
    "99%_CI_above": 0.2391341451369226,
    "model_uuid": "df104fdb-2b70-4495-ad02-be1d8e58c351",
    "low_variance_datasets": 0.2564606870820915
}