{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "97cc4fa6-bdc0-4b27-a328-bf05850ca905",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-13_45_38",
    "eval_metrics": {
        "perplexity": 6.57087888320287,
        "downstream_perpexity": {
            "mmlu": 5.38398509422938,
            "hellaswag": 4.865634326140086,
            "jeopardy_all": 7.365468394756317,
            "triviaqa_sm_sub": 7.023566921552022,
            "gsm8k": 6.113350443045298,
            "agi_eval_sat_math": 7.449018164114519,
            "aqua": 5.914759036472866,
            "svamp": 5.763761135737101,
            "bigbench_qa_wikidata": 9.23016414642334,
            "arc_easy": 6.172914576530457,
            "arc_challenge": 5.875411558151245,
            "bigbench_misconceptions": 5.534621578373321,
            "copa": 5.2432542991638185,
            "siqa": 5.208015441894531,
            "commonsense_qa": 5.744494378566742,
            "piqa": 5.259427189826965,
            "openbook_qa": 6.932128251075745,
            "bigbench_novel_concepts": 5.205585777759552,
            "bigbench_strange_stories": 6.058499056717445,
            "bigbench_strategy_qa": 5.011242083708445,
            "lambada_openai": 8.16113675435384,
            "winograd_wsc": 4.594949148513458,
            "winogrande": 4.884976603587469,
            "bigbench_conlang_translation": 6.974088046608902,
            "bigbench_language_identification": 5.153419423103332,
            "bigbench_conceptual_combinations": 6.103486061096191,
            "bigbench_elementary_math_qa": 6.723032208283742,
            "bigbench_dyck_languages": 9.511093764745889,
            "agi_eval_lsat_ar": 5.045975328528363,
            "bigbench_cs_algorithms": 12.318805964787801,
            "bigbench_logical_deduction": 4.5188581069310505,
            "bigbench_operators": 6.361913583392188,
            "bigbench_repeat_copy_logic": 6.533355206251144,
            "simple_arithmetic_nospaces": 7.629722005178948,
            "simple_arithmetic_withspaces": 7.1818691772573136,
            "math_qa": 5.739543354511261,
            "logi_qa": 5.315788303835235,
            "pubmed_qa_labeled": 8.25851341005133,
            "squad": 7.052402091026306,
            "agi_eval_lsat_rc": 5.156967962855723,
            "agi_eval_lsat_lr": 5.223898048961864,
            "coqa": 6.994150078296661,
            "bigbench_understanding_fables": 5.85041598920469,
            "boolq": 9.250418114662171,
            "agi_eval_sat_en": 4.987219616047387,
            "winogender_mc_female": 7.317125209172567,
            "winogender_mc_male": 6.893816081682841,
            "enterprise_pii_classification": 9.545952002207438,
            "bbq": 6.622781729698181,
            "human_eval_return_complex": 7.939603873125211,
            "human_eval_return_simple": 9.950749912777463,
            "human_eval-0.5": 8.065801338451665,
            "human_eval-0.25": 7.993996355591751,
            "human_eval-0.75": 8.203669443363097,
            "human_eval": 7.933427165194256,
            "processed_human_eval_cpp": 8.148682084883221,
            "processed_human_eval_js": 8.077174896147193
        },
        "icl": {
            "mmlu_zeroshot": 0.23116857623844816,
            "mmlu": 0.23116857623844816,
            "hellaswag_zeroshot": 0.23919537663459778,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.2777777910232544,
            "arc_challenge": 0.20648464560508728,
            "bigbench_misconceptions": 0.4840182662010193,
            "copa": 0.5199999809265137,
            "siqa": 0.48413512110710144,
            "commonsense_qa": 0.1957411915063858,
            "piqa": 0.5435255765914917,
            "openbook_qa": 0.25200000405311584,
            "bigbench_novel_concepts": 0.25,
            "bigbench_strange_stories": 0.49425286054611206,
            "bigbench_strategy_qa": 0.4674530327320099,
            "lambada_openai": 0.000776246830355376,
            "hellaswag": 0.2390957921743393,
            "winograd": 0.5201465487480164,
            "winogrande": 0.494080513715744,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2502000033855438,
            "bigbench_conceptual_combinations": 0.3106796145439148,
            "bigbench_elementary_math_qa": 0.2653825879096985,
            "bigbench_dyck_languages": 0.0,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.0,
            "bigbench_logical_deduction": 0.2606666684150696,
            "bigbench_operators": 0.08095238357782364,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.006000000052154064,
            "math_qa": 0.24840764701366425,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.470336377620697,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4833333194255829,
            "winogender_mc_male": 0.5166666507720947,
            "enterprise_pii_classification": 0.5163475871086121,
            "bbq": 0.46013046394694934
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.05363184461990992,
        "language understanding": 0.010182260926133797,
        "reading comprehension": 0.011714249849319458,
        "safety": -0.011760989373380493,
        "symbolic problem solving": 0.01914759771898389,
        "world knowledge": -0.012853835265985103
    },
    "aggregated_centered_results": 0.014540513124044332,
    "aggregated_results": 0.2264438246534749,
    "rw_small": 0.3679791142543157,
    "95%_CI_above": 0.21578583532087084,
    "99%_CI_above": 0.1913018790143542,
    "model_uuid": "6125019e-4169-462b-91a4-8e819c5fe94d",
    "low_variance_datasets": 0.20529088143799032
}