{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "a1bc6ae8-0125-4ca0-82ce-43cc7f96465b",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-20_25_03",
    "eval_metrics": {
        "perplexity": 5.60438783566157,
        "downstream_perpexity": {
            "mmlu": 5.478638497988383,
            "hellaswag": 4.3241539478302,
            "jeopardy_all": 6.836886954307556,
            "triviaqa_sm_sub": 6.720737326145172,
            "gsm8k": 4.867124271392822,
            "agi_eval_sat_math": 5.800521753051064,
            "aqua": 4.995204641381089,
            "svamp": 4.96195082505544,
            "bigbench_qa_wikidata": 8.312567289670309,
            "arc_easy": 5.5826855301856995,
            "arc_challenge": 5.330687940120697,
            "bigbench_misconceptions": 5.62325832506293,
            "copa": 4.835743265151978,
            "siqa": 5.555394661426544,
            "commonsense_qa": 5.744391473134359,
            "piqa": 4.73629010518392,
            "openbook_qa": 6.559520210266113,
            "bigbench_novel_concepts": 4.957139372825623,
            "bigbench_strange_stories": 5.744706543012597,
            "bigbench_strategy_qa": 5.4295748194058735,
            "lambada_openai": 6.014195098479589,
            "winograd_wsc": 4.284984277281569,
            "winogrande": 4.598196766773859,
            "bigbench_conlang_translation": 4.623123075903916,
            "bigbench_language_identification": 4.531593489646911,
            "bigbench_conceptual_combinations": 3.1467698222225153,
            "bigbench_elementary_math_qa": 5.689628680547078,
            "bigbench_dyck_languages": 8.822983756786636,
            "agi_eval_lsat_ar": 5.13201031477555,
            "bigbench_cs_algorithms": 10.1396457751592,
            "bigbench_logical_deduction": 1.71163762708505,
            "bigbench_operators": 5.99869502158392,
            "bigbench_repeat_copy_logic": 4.439150393009186,
            "simple_arithmetic_nospaces": 7.8348449638911655,
            "simple_arithmetic_withspaces": 7.370885605571651,
            "math_qa": 5.166134675343831,
            "logi_qa": 5.393588997435094,
            "pubmed_qa_labeled": 8.41292676154305,
            "squad": 5.259735512733459,
            "agi_eval_lsat_rc": 5.47513957699733,
            "agi_eval_lsat_lr": 5.622873957016889,
            "coqa": 5.963221081097921,
            "bigbench_understanding_fables": 5.683408790164524,
            "boolq": 7.84046509663264,
            "agi_eval_sat_en": 5.627837782924615,
            "winogender_mc_female": 5.019342788060507,
            "winogender_mc_male": 4.387675507863363,
            "enterprise_pii_classification": 8.515670080979666,
            "bbq": 3.0622177928686143,
            "human_eval_return_complex": 6.400695004801112,
            "human_eval_return_simple": 8.896567989040065,
            "human_eval-0.5": 6.4170599739725995,
            "human_eval-0.25": 6.445356212011197,
            "human_eval-0.75": 6.54280097310136,
            "human_eval": 6.430072668122082,
            "processed_human_eval_cpp": 6.759089795699031,
            "processed_human_eval_js": 6.564584388965514
        },
        "icl": {
            "mmlu_zeroshot": 0.23143000874603004,
            "mmlu": 0.23305184773185797,
            "hellaswag_zeroshot": 0.2511451840400696,
            "jeopardy": 0.0005361930467188359,
            "triviaqa_sm_sub": 0.0006666666595265269,
            "gsm8k": 0.0022744503803551197,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.01224489789456129,
            "svamp": 0.013333333656191826,
            "bigbench_qa_wikidata": 0.013582008890807629,
            "arc_easy": 0.2861952781677246,
            "arc_challenge": 0.20563140511512756,
            "bigbench_misconceptions": 0.4885844886302948,
            "copa": 0.4000000059604645,
            "siqa": 0.48720574378967285,
            "commonsense_qa": 0.23669123649597168,
            "piqa": 0.5565832257270813,
            "openbook_qa": 0.22200000286102295,
            "bigbench_novel_concepts": 0.34375,
            "bigbench_strange_stories": 0.4482758641242981,
            "bigbench_strategy_qa": 0.5102664828300476,
            "lambada_openai": 0.025810208171606064,
            "hellaswag": 0.24935272336006165,
            "winograd": 0.5054945349693298,
            "winogrande": 0.5209155678749084,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.25360000133514404,
            "bigbench_conceptual_combinations": 0.26213592290878296,
            "bigbench_elementary_math_qa": 0.2646488547325134,
            "bigbench_dyck_languages": 0.02199999988079071,
            "agi_eval_lsat_ar": 0.22173912823200226,
            "bigbench_cs_algorithms": 0.10454545170068741,
            "bigbench_logical_deduction": 0.2613333463668823,
            "bigbench_operators": 0.09047619253396988,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.004000000189989805,
            "math_qa": 0.24404960870742798,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0007568590226583183,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.3782874643802643,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.5,
            "winogender_mc_male": 0.5833333134651184,
            "enterprise_pii_classification": 0.5092783570289612,
            "bbq": 0.43121781945228577
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.05932303642233212,
        "language understanding": 0.012534440339853367,
        "reading comprehension": -0.011203371082956435,
        "safety": 0.011914744973182678,
        "symbolic problem solving": 0.029392530184446108,
        "world knowledge": -0.008287679307671313
    },
    "aggregated_centered_results": 0.017793693012389887,
    "aggregated_results": 0.2294218635150391,
    "rw_small": 0.3403624842564265,
    "95%_CI_above": 0.2188921336577429,
    "99%_CI_above": 0.192665152459871,
    "model_uuid": "760cfcef-27be-4d9d-8a9b-3248f8e3860d",
    "low_variance_datasets": 0.20660648508029142
}