{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "702955d8-6571-4220-973e-316e6a298e1a",
    "model": "d=512_l=8_h=4",
    "creation_date": "2024_01_29-12_17_39",
    "eval_metrics": {
        "perplexity": 3.564403438568115,
        "downstream_perpexity": {
            "mmlu": 5.212295913696289,
            "hellaswag": 3.7128775914510093,
            "jeopardy_all": 5.209237670898437,
            "triviaqa_sm_sub": 5.273501475652059,
            "gsm8k": 3.696703386306763,
            "agi_eval_sat_math": 2.723585013990049,
            "aqua": 3.962138545128607,
            "svamp": 4.228588980597419,
            "bigbench_qa_wikidata": 7.005190340677897,
            "arc_easy": 4.557654651006063,
            "arc_challenge": 4.443177286783854,
            "bigbench_misconceptions": 5.267624890362775,
            "copa": 4.684292316436768,
            "siqa": 5.221746985117594,
            "commonsense_qa": 5.420291646321615,
            "piqa": 4.1686996142069495,
            "openbook_qa": 5.6118187365993375,
            "bigbench_novel_concepts": 4.745449066162109,
            "bigbench_strange_stories": 3.9503624655983667,
            "bigbench_strategy_qa": 4.0377544403076175,
            "lambada_openai": 3.439079014460246,
            "winograd_wsc": 3.5272348207585953,
            "winogrande": 3.9112638155619304,
            "bigbench_conlang_translation": 3.0994609355926515,
            "bigbench_language_identification": 4.778064028422038,
            "bigbench_conceptual_combinations": 1.2022187801507802,
            "bigbench_elementary_math_qa": 5.141394329071045,
            "bigbench_dyck_languages": 6.309648293407023,
            "agi_eval_lsat_ar": 5.844467590595114,
            "bigbench_cs_algorithms": 5.7523146311442055,
            "bigbench_logical_deduction": 1.0306397080421448,
            "bigbench_operators": 5.78896243755634,
            "bigbench_repeat_copy_logic": 2.238411158323288,
            "simple_arithmetic_nospaces": 7.275410083161683,
            "simple_arithmetic_withspaces": 6.315796259070645,
            "math_qa": 5.047430229187012,
            "logi_qa": 5.634036340831239,
            "pubmed_qa_labeled": 8.470876517416048,
            "squad": 3.5324408213297525,
            "agi_eval_lsat_rc": 5.614487792506362,
            "agi_eval_lsat_lr": 5.849684357643127,
            "coqa": 4.073408873875936,
            "bigbench_understanding_fables": 4.73647157351176,
            "boolq": 5.653476365407308,
            "agi_eval_sat_en": 5.909370092245249,
            "winogender_mc_female": 4.25230462210519,
            "winogender_mc_male": 3.3953235489981517,
            "enterprise_pii_classification": 4.9005817095438635,
            "bbq": 0.8215120712916056,
            "human_eval_return_complex": 2.4156265258789062,
            "human_eval_return_simple": 4.019250106811524,
            "human_eval-0.5": 2.3199689626693725,
            "human_eval-0.25": 2.2412492275238036,
            "human_eval-0.75": 2.348432517051697,
            "human_eval": 2.3367814779281617,
            "processed_human_eval_cpp": 1.9139333605766295,
            "processed_human_eval_js": 1.7712652206420898
        },
        "icl": {
            "mmlu_zeroshot": 0.2314726712933758,
            "mmlu": 0.2408022974666796,
            "hellaswag_zeroshot": 0.2648874819278717,
            "jeopardy": 0.0005361930467188359,
            "triviaqa_sm_sub": 0.0006666666595265269,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0533333346247673,
            "bigbench_qa_wikidata": 0.08533044904470444,
            "arc_easy": 0.3211279511451721,
            "arc_challenge": 0.22610922157764435,
            "bigbench_misconceptions": 0.4885844886302948,
            "copa": 0.5099999904632568,
            "siqa": 0.5025588274002075,
            "commonsense_qa": 0.32350531220436096,
            "piqa": 0.5554950833320618,
            "openbook_qa": 0.2460000067949295,
            "bigbench_novel_concepts": 0.34375,
            "bigbench_strange_stories": 0.44252872467041016,
            "bigbench_strategy_qa": 0.5024027824401855,
            "lambada_openai": 0.16902774572372437,
            "hellaswag": 0.2602071166038513,
            "winograd": 0.4871794879436493,
            "winogrande": 0.5122336149215698,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2547999918460846,
            "bigbench_conceptual_combinations": 0.3203883469104767,
            "bigbench_elementary_math_qa": 0.26781970262527466,
            "bigbench_dyck_languages": 0.07000000029802322,
            "agi_eval_lsat_ar": 0.16956521570682526,
            "bigbench_cs_algorithms": 0.4007575809955597,
            "bigbench_logical_deduction": 0.24266666173934937,
            "bigbench_operators": 0.05714285746216774,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.24740193784236908,
            "logi_qa": 0.2150537669658661,
            "pubmed_qa_labeled": 0.36899998784065247,
            "squad": 0.001986755058169365,
            "agi_eval_lsat_rc": 0.2537313401699066,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0404609814286232,
            "bigbench_understanding_fables": 0.2698412835597992,
            "boolq": 0.41437309980392456,
            "agi_eval_sat_en": 0.2572815418243408,
            "winogender_mc_female": 0.550000011920929,
            "winogender_mc_male": 0.5333333611488342,
            "enterprise_pii_classification": 0.5057437419891357,
            "bbq": 0.4714467227458954
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.09170557310183843,
        "language understanding": 0.037695566813151046,
        "reading comprehension": 0.03990516571017603,
        "safety": 0.030261918902397156,
        "symbolic problem solving": 0.04828448016713891,
        "world knowledge": 0.011214809331921068
    },
    "aggregated_centered_results": 0.044891839172026764,
    "aggregated_results": 0.25428453290343733,
    "rw_small": 0.3747885599732399,
    "95%_CI_above": 0.26323482763332623,
    "99%_CI_above": 0.24176012839749453,
    "model_uuid": "fdecd6f8-1425-4783-b08a-54362a0b52a6",
    "low_variance_datasets": 0.2441239153331315
}