{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "24990ad8-517b-41ef-94d7-7ed54edcb64c",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-12_58_16",
    "eval_metrics": {
        "perplexity": 5.351390059789022,
        "downstream_perpexity": {
            "mmlu": 5.326518821716308,
            "hellaswag": 4.132080195347468,
            "jeopardy_all": 6.479855247338613,
            "triviaqa_sm_sub": 6.332325422763825,
            "gsm8k": 4.566470583279927,
            "agi_eval_sat_math": 5.182917809486389,
            "aqua": 4.811354302386849,
            "svamp": 5.4408704996109005,
            "bigbench_qa_wikidata": 8.78937108516693,
            "arc_easy": 5.267230504751206,
            "arc_challenge": 5.056600642204285,
            "bigbench_misconceptions": 5.8737263679504395,
            "copa": 4.650416631698608,
            "siqa": 4.929720743497213,
            "commonsense_qa": 5.712803443272908,
            "piqa": 4.509179308017095,
            "openbook_qa": 6.312884196281433,
            "bigbench_novel_concepts": 4.830595701932907,
            "bigbench_strange_stories": 5.516165886802235,
            "bigbench_strategy_qa": 4.887023969491323,
            "lambada_openai": 5.603625476360321,
            "winograd_wsc": 4.10809401278094,
            "winogrande": 4.457747020324072,
            "bigbench_conlang_translation": 4.274029060107906,
            "bigbench_language_identification": 4.703491818904877,
            "bigbench_conceptual_combinations": 2.458128590028263,
            "bigbench_elementary_math_qa": 5.436363083124161,
            "bigbench_dyck_languages": 8.23165525608704,
            "agi_eval_lsat_ar": 5.2473109929457955,
            "bigbench_cs_algorithms": 7.530484890937805,
            "bigbench_logical_deduction": 1.4717616826295852,
            "bigbench_operators": 5.964246563684373,
            "bigbench_repeat_copy_logic": 3.49418842792511,
            "simple_arithmetic_nospaces": 7.885337743939472,
            "simple_arithmetic_withspaces": 7.436169352851996,
            "math_qa": 4.780198514461517,
            "logi_qa": 5.232669674672656,
            "pubmed_qa_labeled": 9.092808139424363,
            "squad": 4.602629985411962,
            "agi_eval_lsat_rc": 5.508591878770003,
            "agi_eval_lsat_lr": 5.444544985715081,
            "coqa": 5.610128480195999,
            "bigbench_understanding_fables": 4.833262660516002,
            "boolq": 7.215410657723745,
            "agi_eval_sat_en": 6.1217855967364265,
            "winogender_mc_female": 4.82968962987264,
            "winogender_mc_male": 3.8345304489135743,
            "enterprise_pii_classification": 8.964718302090963,
            "bbq": 2.3056913326183954,
            "human_eval_return_complex": 5.8499304741386355,
            "human_eval_return_simple": 8.167084229959023,
            "human_eval-0.5": 5.942447563497032,
            "human_eval-0.25": 6.011526197921939,
            "human_eval-0.75": 6.018526615166083,
            "human_eval": 6.0044731047095325,
            "processed_human_eval_cpp": 6.434374202112234,
            "processed_human_eval_js": 6.224536631165481
        },
        "icl": {
            "mmlu_zeroshot": 0.23163139950810818,
            "mmlu": 0.2336820383605204,
            "hellaswag_zeroshot": 0.2558255195617676,
            "jeopardy": 0.0005361930467188359,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.004081632476300001,
            "svamp": 0.006666666828095913,
            "bigbench_qa_wikidata": 0.028591113165020943,
            "arc_easy": 0.2857744097709656,
            "arc_challenge": 0.19880546629428864,
            "bigbench_misconceptions": 0.47488585114479065,
            "copa": 0.4399999976158142,
            "siqa": 0.5,
            "commonsense_qa": 0.2555282413959503,
            "piqa": 0.5636561512947083,
            "openbook_qa": 0.23000000417232513,
            "bigbench_novel_concepts": 0.21875,
            "bigbench_strange_stories": 0.4367816150188446,
            "bigbench_strategy_qa": 0.4914810061454773,
            "lambada_openai": 0.05278478562831879,
            "hellaswag": 0.2543317973613739,
            "winograd": 0.523809552192688,
            "winogrande": 0.507498025894165,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.25189998745918274,
            "bigbench_conceptual_combinations": 0.223300963640213,
            "bigbench_elementary_math_qa": 0.26912999153137207,
            "bigbench_dyck_languages": 0.023000000044703484,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.040909089148044586,
            "bigbench_logical_deduction": 0.25,
            "bigbench_operators": 0.07619047909975052,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004999999888241291,
            "simple_arithmetic_withspaces": 0.007000000216066837,
            "math_qa": 0.2463962435722351,
            "logi_qa": 0.20430107414722443,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0005676442524418235,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.28431373834609985,
            "coqa": 0.005762244574725628,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.38042813539505005,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.44999998807907104,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.5030927658081055,
            "bbq": 0.437225493517789
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.045359400411446885,
        "language understanding": 0.011984704062342644,
        "reading comprehension": -0.008991182505269533,
        "safety": -0.029840870337052766,
        "symbolic problem solving": 0.023550001800888117,
        "world knowledge": -0.010947071615467324
    },
    "aggregated_centered_results": 0.010453595068422198,
    "aggregated_results": 0.22492028649805737,
    "rw_small": 0.34472192575534183,
    "95%_CI_above": 0.2144967926412849,
    "99%_CI_above": 0.19439380967151373,
    "model_uuid": "33d0a2c8-7db2-41d9-ad67-3a5369e24126",
    "low_variance_datasets": 0.2091811725732193
}