{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "9313b829-fdbc-49e8-bee7-56c5e3a14e66",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-22_12_40",
    "eval_metrics": {
        "perplexity": 5.543116986751556,
        "downstream_perpexity": {
            "mmlu": 5.15352490345637,
            "hellaswag": 5.197704601287842,
            "jeopardy_all": 7.333162669340769,
            "triviaqa_sm_sub": 6.957518744468689,
            "gsm8k": 6.2300267259279885,
            "agi_eval_sat_math": 4.571010624278675,
            "aqua": 5.606185198803337,
            "svamp": 5.392368618647257,
            "bigbench_qa_wikidata": 9.716427059968312,
            "arc_easy": 6.494885408878327,
            "arc_challenge": 6.046290822823843,
            "bigbench_misconceptions": 6.280183495996205,
            "copa": 5.2228590726852415,
            "siqa": 4.850850184758504,
            "commonsense_qa": 5.288508864243825,
            "piqa": 5.737847749392191,
            "openbook_qa": 7.158998568534851,
            "bigbench_novel_concepts": 5.587191611528397,
            "bigbench_strange_stories": 6.501971420200392,
            "bigbench_strategy_qa": 4.863563628991445,
            "lambada_openai": 8.136037747065226,
            "winograd_wsc": 4.643078741136488,
            "winogrande": 5.05129029750824,
            "bigbench_conlang_translation": 6.7464269952076235,
            "bigbench_language_identification": 4.702198509375254,
            "bigbench_conceptual_combinations": 6.140430274518948,
            "bigbench_elementary_math_qa": 6.461964317162832,
            "bigbench_dyck_languages": 6.626224394605941,
            "agi_eval_lsat_ar": 4.990147290022477,
            "bigbench_cs_algorithms": 8.950489906469981,
            "bigbench_logical_deduction": 4.577642699082692,
            "bigbench_operators": 6.212689867473784,
            "bigbench_repeat_copy_logic": 6.6481359004974365,
            "simple_arithmetic_nospaces": 7.201236097752547,
            "simple_arithmetic_withspaces": 7.210588788785854,
            "math_qa": 4.445755902926127,
            "logi_qa": 5.039451903828095,
            "pubmed_qa_labeled": 10.022411842306122,
            "squad": 7.050077172120412,
            "agi_eval_lsat_rc": 5.14594466116891,
            "agi_eval_lsat_lr": 5.110351634493061,
            "coqa": 7.095361332098643,
            "bigbench_understanding_fables": 4.836274351392474,
            "boolq": 9.065519833564759,
            "agi_eval_sat_en": 5.083889576995257,
            "winogender_mc_female": 7.599185458819071,
            "winogender_mc_male": 7.449526429176331,
            "enterprise_pii_classification": 8.209705193837484,
            "bbq": 6.47286608616511,
            "human_eval_return_complex": 4.560501429039662,
            "human_eval_return_simple": 5.5826640129089355,
            "human_eval-0.5": 4.549747664753983,
            "human_eval-0.25": 4.525493278736022,
            "human_eval-0.75": 4.603175659005235,
            "human_eval": 4.546614966741422,
            "processed_human_eval_cpp": 4.295519929494917,
            "processed_human_eval_js": 3.472212957172859
        },
        "icl": {
            "mmlu_zeroshot": 0.23116857623844816,
            "mmlu": 0.23116857623844816,
            "hellaswag_zeroshot": 0.23919537663459778,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.0006666666595265269,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.26052188873291016,
            "arc_challenge": 0.20051194727420807,
            "bigbench_misconceptions": 0.456620991230011,
            "copa": 0.4699999988079071,
            "siqa": 0.48413512110710144,
            "commonsense_qa": 0.1957411915063858,
            "piqa": 0.5299238562583923,
            "openbook_qa": 0.25600001215934753,
            "bigbench_novel_concepts": 0.21875,
            "bigbench_strange_stories": 0.517241358757019,
            "bigbench_strategy_qa": 0.4674530327320099,
            "lambada_openai": 0.000776246830355376,
            "hellaswag": 0.24078868329524994,
            "winograd": 0.5384615659713745,
            "winogrande": 0.5043409466743469,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2502000033855438,
            "bigbench_conceptual_combinations": 0.3106796145439148,
            "bigbench_elementary_math_qa": 0.2662735879421234,
            "bigbench_dyck_languages": 0.0,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.0,
            "bigbench_logical_deduction": 0.2639999985694885,
            "bigbench_operators": 0.095238097012043,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.004999999888241291,
            "math_qa": 0.24840764701366425,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.38134557008743286,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.44999998807907104,
            "winogender_mc_male": 0.5333333611488342,
            "enterprise_pii_classification": 0.5063328146934509,
            "bbq": 0.4560995399951935
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.04518687600890796,
        "language understanding": 0.017608271991775837,
        "reading comprehension": -0.01053345203399658,
        "safety": -0.02711714804172516,
        "symbolic problem solving": 0.020408807953612672,
        "world knowledge": -0.023532920779387345
    },
    "aggregated_centered_results": 0.008382229973248792,
    "aggregated_results": 0.22271675644745592,
    "rw_small": 0.34265262136856717,
    "95%_CI_above": 0.21003374384987789,
    "99%_CI_above": 0.18581060620723291,
    "model_uuid": "24419526-0d05-4b07-a0ec-466d61aa4d28",
    "low_variance_datasets": 0.1995060156305871
}