{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "92a32b48-5ea1-453e-83eb-d9a592778df1",
    "model": "d=512_l=8_h=4",
    "creation_date": "2024_01_29-19_46_05",
    "eval_metrics": {
        "perplexity": 5.0461796482404075,
        "downstream_perpexity": {
            "mmlu": 4.819797631104787,
            "hellaswag": 4.816861176490784,
            "jeopardy_all": 6.62906094789505,
            "triviaqa_sm_sub": 6.612497468789418,
            "gsm8k": 5.288113717238108,
            "agi_eval_sat_math": 3.982070010358637,
            "aqua": 5.066247626713344,
            "svamp": 5.00629569530487,
            "bigbench_qa_wikidata": 9.135739648342133,
            "arc_easy": 5.980279902617137,
            "arc_challenge": 5.593016505241394,
            "bigbench_misconceptions": 5.920486145367906,
            "copa": 4.933412218093872,
            "siqa": 4.2132229248682656,
            "commonsense_qa": 4.7108596404393515,
            "piqa": 5.382528460025787,
            "openbook_qa": 6.792973074913025,
            "bigbench_novel_concepts": 5.464532852172852,
            "bigbench_strange_stories": 6.109231250039462,
            "bigbench_strategy_qa": 4.643807864189148,
            "lambada_openai": 6.940148357550303,
            "winograd_wsc": 4.3697484369243025,
            "winogrande": 4.725549225012461,
            "bigbench_conlang_translation": 5.606274061086701,
            "bigbench_language_identification": 5.395252362887065,
            "bigbench_conceptual_combinations": 4.886844968332828,
            "bigbench_elementary_math_qa": 5.904252747694652,
            "bigbench_dyck_languages": 5.175402106357222,
            "agi_eval_lsat_ar": 4.521311245793882,
            "bigbench_cs_algorithms": 11.26153937180837,
            "bigbench_logical_deduction": 3.299562225739161,
            "bigbench_operators": 6.527375568662371,
            "bigbench_repeat_copy_logic": 5.684973910450935,
            "simple_arithmetic_nospaces": 8.046457400341996,
            "simple_arithmetic_withspaces": 7.376765445500863,
            "math_qa": 5.683131734530131,
            "logi_qa": 4.550697343506938,
            "pubmed_qa_labeled": 8.353316874063315,
            "squad": 6.0020116845766704,
            "agi_eval_lsat_rc": 5.060978291639641,
            "agi_eval_lsat_lr": 4.969360875148399,
            "coqa": 6.266214481989542,
            "bigbench_understanding_fables": 5.380941673561379,
            "boolq": 9.728593055407206,
            "agi_eval_sat_en": 4.995155068277155,
            "winogender_mc_female": 6.745009128252665,
            "winogender_mc_male": 5.909033528963724,
            "enterprise_pii_classification": 7.723163735866547,
            "bbq": 4.839430536826452,
            "human_eval_return_complex": 3.9139008127798247,
            "human_eval_return_simple": 5.685212521939664,
            "human_eval-0.5": 3.838829044888659,
            "human_eval-0.25": 3.8320849886754664,
            "human_eval-0.75": 3.944304703212366,
            "human_eval": 3.894328748307577,
            "processed_human_eval_cpp": 3.5242626089487015,
            "processed_human_eval_js": 3.0221364934269976
        },
        "icl": {
            "mmlu_zeroshot": 0.23198364126054863,
            "mmlu": 0.23290939537579552,
            "hellaswag_zeroshot": 0.24845647811889648,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.26473063230514526,
            "arc_challenge": 0.21587030589580536,
            "bigbench_misconceptions": 0.4885844886302948,
            "copa": 0.4699999988079071,
            "siqa": 0.48567041754722595,
            "commonsense_qa": 0.2203112244606018,
            "piqa": 0.5413492918014526,
            "openbook_qa": 0.23999999463558197,
            "bigbench_novel_concepts": 0.21875,
            "bigbench_strange_stories": 0.5,
            "bigbench_strategy_qa": 0.5255570411682129,
            "lambada_openai": 0.003104987321421504,
            "hellaswag": 0.24337781965732574,
            "winograd": 0.5091575384140015,
            "winogrande": 0.4964483082294464,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2522999942302704,
            "bigbench_conceptual_combinations": 0.3009708821773529,
            "bigbench_elementary_math_qa": 0.26501572132110596,
            "bigbench_dyck_languages": 0.004000000189989805,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.012878787703812122,
            "bigbench_logical_deduction": 0.257999986410141,
            "bigbench_operators": 0.0714285746216774,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.004999999888241291,
            "math_qa": 0.24941334128379822,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 9.460737783228979e-05,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0032569209579378366,
            "bigbench_understanding_fables": 0.26455026865005493,
            "boolq": 0.3782874643802643,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.46666666865348816,
            "winogender_mc_male": 0.5,
            "enterprise_pii_classification": 0.5063328146934509,
            "bbq": 0.44819845123724505
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.060071057329575225,
        "language understanding": 0.009307114106680576,
        "reading comprehension": -0.009997201600526749,
        "safety": -0.03940103270790793,
        "symbolic problem solving": 0.019591009254670808,
        "world knowledge": -0.011896548703323429
    },
    "aggregated_centered_results": 0.01012032111858187,
    "aggregated_results": 0.22382173424265617,
    "rw_small": 0.34563219795624417,
    "95%_CI_above": 0.21171380188141925,
    "99%_CI_above": 0.18620690835261486,
    "model_uuid": "4c95a0ba-21ed-46ca-8965-b1a27167e3c9",
    "low_variance_datasets": 0.20005181310691775
}