{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "2e6d501e-50ab-4175-b7c7-3a9a262722e9",
    "model": "d=512_l=8_h=4",
    "creation_date": "2024_01_29-16_17_37",
    "eval_metrics": {
        "perplexity": 5.5417195955912275,
        "downstream_perpexity": {
            "mmlu": 5.785787924130758,
            "hellaswag": 4.429715617497762,
            "jeopardy_all": 6.679518993695577,
            "triviaqa_sm_sub": 6.449497540791829,
            "gsm8k": 5.136895374457041,
            "agi_eval_sat_math": 4.957894396781922,
            "aqua": 4.974226498117252,
            "svamp": 4.998908999760945,
            "bigbench_qa_wikidata": 9.138687256971995,
            "arc_easy": 5.66556080977122,
            "arc_challenge": 5.384217143058777,
            "bigbench_misconceptions": 5.269231689575056,
            "copa": 4.732700002193451,
            "siqa": 6.133848448594411,
            "commonsense_qa": 6.358402387301127,
            "piqa": 4.926943151156108,
            "openbook_qa": 6.530652176856995,
            "bigbench_novel_concepts": 5.435652673244476,
            "bigbench_strange_stories": 5.519271497068734,
            "bigbench_strategy_qa": 4.339672766129175,
            "lambada_openai": 6.675881046056747,
            "winograd_wsc": 4.234651560311789,
            "winogrande": 4.548615167538325,
            "bigbench_conlang_translation": 5.573342163388322,
            "bigbench_language_identification": 5.927461508909861,
            "bigbench_conceptual_combinations": 4.598793670969102,
            "bigbench_elementary_math_qa": 6.093258635203044,
            "bigbench_dyck_languages": 6.457230827387641,
            "agi_eval_lsat_ar": 5.964522208338199,
            "bigbench_cs_algorithms": 11.33228805065155,
            "bigbench_logical_deduction": 3.3907692869504293,
            "bigbench_operators": 6.619539980661301,
            "bigbench_repeat_copy_logic": 5.113778933882713,
            "simple_arithmetic_nospaces": 8.731563535558076,
            "simple_arithmetic_withspaces": 8.009204529413656,
            "math_qa": 6.270939540863037,
            "logi_qa": 5.645730947019867,
            "pubmed_qa_labeled": 9.765582406220316,
            "squad": 5.689604099591573,
            "agi_eval_lsat_rc": 6.3550084686991,
            "agi_eval_lsat_lr": 6.3356350515403,
            "coqa": 5.96228206952413,
            "bigbench_understanding_fables": 4.889510195091288,
            "boolq": 8.556489157676697,
            "agi_eval_sat_en": 6.075659397736336,
            "winogender_mc_female": 5.372383673985799,
            "winogender_mc_male": 4.65812441110611,
            "enterprise_pii_classification": 8.365177528063457,
            "bbq": 4.536579714218775,
            "human_eval_return_complex": 7.005977341509241,
            "human_eval_return_simple": 10.772175969304266,
            "human_eval-0.5": 7.007998277501362,
            "human_eval-0.25": 7.055022111753138,
            "human_eval-0.75": 7.0728785904442395,
            "human_eval": 7.1431823997962764,
            "processed_human_eval_cpp": 6.513865281336056,
            "processed_human_eval_js": 6.029464314623577
        },
        "icl": {
            "mmlu_zeroshot": 0.2319676664314772,
            "mmlu": 0.23698858184772625,
            "hellaswag_zeroshot": 0.24646484851837158,
            "jeopardy": 0.0005361930467188359,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0007581501267850399,
            "agi_eval_sat_math": 0.013636363670229912,
            "aqua": 0.0,
            "svamp": 0.036666665226221085,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.2781986594200134,
            "arc_challenge": 0.19880546629428864,
            "bigbench_misconceptions": 0.5159817337989807,
            "copa": 0.4699999988079071,
            "siqa": 0.4728761613368988,
            "commonsense_qa": 0.21212121844291687,
            "piqa": 0.5353645086288452,
            "openbook_qa": 0.2240000069141388,
            "bigbench_novel_concepts": 0.28125,
            "bigbench_strange_stories": 0.4712643623352051,
            "bigbench_strategy_qa": 0.46876364946365356,
            "lambada_openai": 0.007762468419969082,
            "hellaswag": 0.24785898625850677,
            "winograd": 0.5128205418586731,
            "winogrande": 0.5138121843338013,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.24860000610351562,
            "bigbench_conceptual_combinations": 0.291262149810791,
            "bigbench_elementary_math_qa": 0.27143606543540955,
            "bigbench_dyck_languages": 0.007000000216066837,
            "agi_eval_lsat_ar": 0.208695650100708,
            "bigbench_cs_algorithms": 0.010606060735881329,
            "bigbench_logical_deduction": 0.25999999046325684,
            "bigbench_operators": 0.07619047909975052,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.0020000000949949026,
            "math_qa": 0.24874287843704224,
            "logi_qa": 0.21966205537319183,
            "pubmed_qa_labeled": 0.0,
            "squad": 9.460737783228979e-05,
            "agi_eval_lsat_rc": 0.2798507511615753,
            "agi_eval_lsat_lr": 0.2705882489681244,
            "coqa": 0.01139922346919775,
            "bigbench_understanding_fables": 0.3174603283405304,
            "boolq": 0.5614678859710693,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4333333373069763,
            "winogender_mc_male": 0.5333333611488342,
            "enterprise_pii_classification": 0.4842415452003479,
            "bbq": 0.4535744975913655
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.040379041184981666,
        "language understanding": 0.013326155215812225,
        "reading comprehension": 0.044948735456576586,
        "safety": -0.04775862937623804,
        "symbolic problem solving": 0.023889423705016573,
        "world knowledge": -0.004902480087610305
    },
    "aggregated_centered_results": 0.017986636318203488,
    "aggregated_results": 0.22885768143374477,
    "rw_small": 0.3781248405575752,
    "95%_CI_above": 0.2186459233569004,
    "99%_CI_above": 0.1937946139387204,
    "model_uuid": "3779b2c9-e9f4-4c5c-88c8-63b04ee073de",
    "low_variance_datasets": 0.20780904518264418
}