{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "64b3841f-897d-4d84-9eba-cfc49fe75cae",
    "model": "d=576_l=24_h=8",
    "creation_date": "2024_01_29-21_58_58",
    "eval_metrics": {
        "perplexity": 3.8958526651064553,
        "downstream_perpexity": {
            "mmlu": 4.943862060705821,
            "hellaswag": 3.3532290438810985,
            "jeopardy_all": 5.004068885246912,
            "triviaqa_sm_sub": 5.154722559452057,
            "gsm8k": 3.487234185139338,
            "agi_eval_sat_math": 2.932788268002597,
            "aqua": 3.733088682135757,
            "svamp": 4.409879370530446,
            "bigbench_qa_wikidata": 6.822569521268209,
            "arc_easy": 4.4623885234196985,
            "arc_challenge": 4.217174108823141,
            "bigbench_misconceptions": 5.6428570399001305,
            "copa": 3.649579243659973,
            "siqa": 4.57468843460083,
            "commonsense_qa": 5.111469614505768,
            "piqa": 3.7883237143357595,
            "openbook_qa": 5.506979932785034,
            "bigbench_novel_concepts": 4.355368748307228,
            "bigbench_strange_stories": 4.096383509964778,
            "bigbench_strategy_qa": 2.690538215637207,
            "lambada_openai": 3.769896547993024,
            "winograd_wsc": 3.4114427256496835,
            "winogrande": 3.7722873787085214,
            "bigbench_conlang_translation": 3.255275266926463,
            "bigbench_language_identification": 5.6654383103052774,
            "bigbench_conceptual_combinations": 1.4546860734235894,
            "bigbench_elementary_math_qa": 5.104619467258454,
            "bigbench_dyck_languages": 5.57937102658408,
            "agi_eval_lsat_ar": 5.482575248635333,
            "bigbench_cs_algorithms": 7.7297362923622135,
            "bigbench_logical_deduction": 1.6004199494918188,
            "bigbench_operators": 6.157809673036848,
            "bigbench_repeat_copy_logic": 2.6169581674039364,
            "simple_arithmetic_nospaces": 7.186747462308707,
            "simple_arithmetic_withspaces": 7.074114357723909,
            "math_qa": 6.0248691956202185,
            "logi_qa": 5.331973867299187,
            "pubmed_qa_labeled": 7.457757424406645,
            "squad": 3.14534884194533,
            "agi_eval_lsat_rc": 5.6191032573358335,
            "agi_eval_lsat_lr": 5.761138801013722,
            "coqa": 3.9173480808734893,
            "bigbench_understanding_fables": 6.145620469693784,
            "boolq": 5.1842873930931095,
            "agi_eval_sat_en": 5.675102053336727,
            "winogender_mc_female": 4.174624149004618,
            "winogender_mc_male": 3.3403882582982383,
            "enterprise_pii_classification": 3.878486867745717,
            "bbq": 0.8051202561706304,
            "human_eval_return_complex": 4.691968320861576,
            "human_eval_return_simple": 8.585956354398984,
            "human_eval-0.5": 4.631401438538621,
            "human_eval-0.25": 4.80302255618863,
            "human_eval-0.75": 4.686219872497931,
            "human_eval": 4.972517403160653,
            "processed_human_eval_cpp": 4.536414341896958,
            "processed_human_eval_js": 4.015717423543697
        },
        "icl": {
            "mmlu_zeroshot": 0.23307304110443383,
            "mmlu": 0.24189594126584238,
            "hellaswag_zeroshot": 0.26787492632865906,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0007581501267850399,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.10757344961166382,
            "arc_easy": 0.32154881954193115,
            "arc_challenge": 0.1962457299232483,
            "bigbench_misconceptions": 0.4931506812572479,
            "copa": 0.5,
            "siqa": 0.48515865206718445,
            "commonsense_qa": 0.26699426770210266,
            "piqa": 0.5919477939605713,
            "openbook_qa": 0.23000000417232513,
            "bigbench_novel_concepts": 0.3125,
            "bigbench_strange_stories": 0.4252873659133911,
            "bigbench_strategy_qa": 0.4932284951210022,
            "lambada_openai": 0.17717833817005157,
            "hellaswag": 0.26598286628723145,
            "winograd": 0.5567765831947327,
            "winogrande": 0.49802684783935547,
            "bigbench_conlang_translation": 0.006097560748457909,
            "bigbench_language_identification": 0.25110000371932983,
            "bigbench_conceptual_combinations": 0.26213592290878296,
            "bigbench_elementary_math_qa": 0.2774895131587982,
            "bigbench_dyck_languages": 0.023000000044703484,
            "agi_eval_lsat_ar": 0.208695650100708,
            "bigbench_cs_algorithms": 0.3545454442501068,
            "bigbench_logical_deduction": 0.23600000143051147,
            "bigbench_operators": 0.095238097012043,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.0010000000474974513,
            "math_qa": 0.24438484013080597,
            "logi_qa": 0.23348695039749146,
            "pubmed_qa_labeled": 0.48399999737739563,
            "squad": 0.0025543991941958666,
            "agi_eval_lsat_rc": 0.21268656849861145,
            "agi_eval_lsat_lr": 0.2980392277240753,
            "coqa": 0.04772641882300377,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.5434250831604004,
            "agi_eval_sat_en": 0.24757280945777893,
            "winogender_mc_female": 0.5333333611488342,
            "winogender_mc_male": 0.5,
            "enterprise_pii_classification": 0.5036818981170654,
            "bbq": 0.45985717935995624
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.07132128874460857,
        "language understanding": 0.044459298330669604,
        "reading comprehension": 0.08056768261788723,
        "safety": -0.0015637806870720528,
        "symbolic problem solving": 0.0452176357490114,
        "world knowledge": 0.010569940154566308
    },
    "aggregated_centered_results": 0.0456344135738968,
    "aggregated_results": 0.25406648866597853,
    "rw_small": 0.3878535677989324,
    "95%_CI_above": 0.2668327770908504,
    "99%_CI_above": 0.2497509905975312,
    "model_uuid": "e35118d6-c015-4bed-8628-2d4483edca6f",
    "low_variance_datasets": 0.2502924874107438
}