{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "2bab7a7f-703d-414b-a4a3-e2824c68aa6f",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-12_40_16",
    "eval_metrics": {
        "perplexity": 6.316096274058024,
        "downstream_perpexity": {
            "mmlu": 5.123593695958456,
            "hellaswag": 4.6440393368403114,
            "jeopardy_all": 7.033847860495249,
            "triviaqa_sm_sub": 6.852552926540374,
            "gsm8k": 5.752879110972087,
            "agi_eval_sat_math": 7.0624901294708256,
            "aqua": 5.661287817663076,
            "svamp": 5.761183864275615,
            "bigbench_qa_wikidata": 9.013067877292633,
            "arc_easy": 5.988477500279744,
            "arc_challenge": 5.703421072165171,
            "bigbench_misconceptions": 5.723295170422558,
            "copa": 4.997007746696472,
            "siqa": 5.141315641005834,
            "commonsense_qa": 5.393425977230072,
            "piqa": 5.087754209836324,
            "openbook_qa": 6.697205178260803,
            "bigbench_novel_concepts": 5.120003968477249,
            "bigbench_strange_stories": 5.818822189309131,
            "bigbench_strategy_qa": 4.992923080921173,
            "lambada_openai": 7.805609003702799,
            "winograd_wsc": 4.4251882671873215,
            "winogrande": 4.706089069445928,
            "bigbench_conlang_translation": 6.573982631287923,
            "bigbench_language_identification": 4.306227465470632,
            "bigbench_conceptual_combinations": 5.661752348964654,
            "bigbench_elementary_math_qa": 5.746531418959299,
            "bigbench_dyck_languages": 9.024753914159888,
            "agi_eval_lsat_ar": 4.691340912943301,
            "bigbench_cs_algorithms": 10.491372712453206,
            "bigbench_logical_deduction": 4.084069490432739,
            "bigbench_operators": 6.465863307317098,
            "bigbench_repeat_copy_logic": 5.998021379113197,
            "simple_arithmetic_nospaces": 7.756407132669657,
            "simple_arithmetic_withspaces": 7.219496899292249,
            "math_qa": 5.318486563364664,
            "logi_qa": 4.925434393084361,
            "pubmed_qa_labeled": 7.201659980942221,
            "squad": 6.723188662528992,
            "agi_eval_lsat_rc": 5.040912610381397,
            "agi_eval_lsat_lr": 5.116821451748119,
            "coqa": 6.67545702457428,
            "bigbench_understanding_fables": 5.156865170393041,
            "boolq": 8.316149441401164,
            "agi_eval_sat_en": 4.912556044106345,
            "winogender_mc_female": 6.515742381413777,
            "winogender_mc_male": 6.146933547655741,
            "enterprise_pii_classification": 8.919738181432088,
            "bbq": 6.090523612499237,
            "human_eval_return_complex": 7.497137955793246,
            "human_eval_return_simple": 9.446011826798722,
            "human_eval-0.5": 7.622327193981263,
            "human_eval-0.25": 7.57765417564206,
            "human_eval-0.75": 7.773609306754135,
            "human_eval": 7.552605067811361,
            "processed_human_eval_cpp": 7.751223558224506,
            "processed_human_eval_js": 7.599400037672462
        },
        "icl": {
            "mmlu_zeroshot": 0.23116857623844816,
            "mmlu": 0.23116857623844816,
            "hellaswag_zeroshot": 0.24138617515563965,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.0006666666595265269,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.2706229090690613,
            "arc_challenge": 0.1919795274734497,
            "bigbench_misconceptions": 0.47031962871551514,
            "copa": 0.47999998927116394,
            "siqa": 0.48413512110710144,
            "commonsense_qa": 0.1957411915063858,
            "piqa": 0.5386289358139038,
            "openbook_qa": 0.24799999594688416,
            "bigbench_novel_concepts": 0.28125,
            "bigbench_strange_stories": 0.5,
            "bigbench_strategy_qa": 0.4674530327320099,
            "lambada_openai": 0.0029109257739037275,
            "hellaswag": 0.2383987307548523,
            "winograd": 0.48351648449897766,
            "winogrande": 0.5082873106002808,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2502000033855438,
            "bigbench_conceptual_combinations": 0.3009708821773529,
            "bigbench_elementary_math_qa": 0.2670597434043884,
            "bigbench_dyck_languages": 0.0,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.0022727272007614374,
            "bigbench_logical_deduction": 0.2626666724681854,
            "bigbench_operators": 0.09047619253396988,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.007000000216066837,
            "math_qa": 0.24840764701366425,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.3776758313179016,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4833333194255829,
            "winogender_mc_male": 0.5,
            "enterprise_pii_classification": 0.5063328146934509,
            "bbq": 0.4444835673679005
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.05338613564769427,
        "language understanding": 0.003474113075450683,
        "reading comprehension": -0.011450886726379393,
        "safety": -0.03292514925653284,
        "symbolic problem solving": 0.020327558766843546,
        "world knowledge": -0.019846827985445847
    },
    "aggregated_centered_results": 0.007406138364060383,
    "aggregated_results": 0.22264100976693982,
    "rw_small": 0.34499195714791614,
    "95%_CI_above": 0.20997463853916887,
    "99%_CI_above": 0.1840926680341363,
    "model_uuid": "eda945b4-cd91-470c-a90d-168c8033827d",
    "low_variance_datasets": 0.1975538131611591
}