{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "73387070-659a-4a31-ba02-c8a29e81f890",
    "model": "d=576_l=24_h=8",
    "creation_date": "2024_02_12-14_18_17",
    "eval_metrics": {
        "perplexity": 4.795270991325379,
        "downstream_perpexity": {
            "mmlu": 4.769954790671666,
            "hellaswag": 3.654067673285802,
            "jeopardy_all": 5.720346887906392,
            "triviaqa_sm_sub": 5.773881481091181,
            "gsm8k": 4.262684863805771,
            "agi_eval_sat_math": 4.57314502325925,
            "aqua": 4.479659642005453,
            "svamp": 4.921536300977071,
            "bigbench_qa_wikidata": 7.721122554937998,
            "arc_easy": 4.799196221431097,
            "arc_challenge": 4.57891534169515,
            "bigbench_misconceptions": 5.525272206084369,
            "copa": 4.028899331092834,
            "siqa": 4.800618433952332,
            "commonsense_qa": 4.74686686595281,
            "piqa": 3.9839930633703866,
            "openbook_qa": 5.832609591484069,
            "bigbench_novel_concepts": 4.582437455654144,
            "bigbench_strange_stories": 5.18443663640954,
            "bigbench_strategy_qa": 4.038564610481262,
            "lambada_openai": 5.067727868755658,
            "winograd_wsc": 3.676346791969551,
            "winogrande": 4.021029396851858,
            "bigbench_conlang_translation": 3.7528496544535566,
            "bigbench_language_identification": 4.0767204860846205,
            "bigbench_conceptual_combinations": 2.138251119446986,
            "bigbench_elementary_math_qa": 5.419869524240494,
            "bigbench_dyck_languages": 6.871906710272076,
            "agi_eval_lsat_ar": 4.770615884532099,
            "bigbench_cs_algorithms": 7.953945648670197,
            "bigbench_logical_deduction": 1.1464784334103266,
            "bigbench_operators": 5.666245535441807,
            "bigbench_repeat_copy_logic": 3.3900045827031136,
            "simple_arithmetic_nospaces": 7.360232348201656,
            "simple_arithmetic_withspaces": 6.8159151357762955,
            "math_qa": 4.282194330294927,
            "logi_qa": 5.12065804535709,
            "pubmed_qa_labeled": 8.628563496245055,
            "squad": 4.449242083231608,
            "agi_eval_lsat_rc": 5.131521486524326,
            "agi_eval_lsat_lr": 5.22672075103311,
            "coqa": 4.76328199505806,
            "bigbench_understanding_fables": 4.775247296328267,
            "boolq": 7.099713663260142,
            "agi_eval_sat_en": 5.2115062000682055,
            "winogender_mc_female": 5.228576429684957,
            "winogender_mc_male": 4.473820590972901,
            "enterprise_pii_classification": 6.317386543750763,
            "bbq": 1.7994176293412845,
            "human_eval_return_complex": 5.324595105929638,
            "human_eval_return_simple": 7.8658357182064575,
            "human_eval-0.5": 5.29992588845695,
            "human_eval-0.25": 5.3835608610292764,
            "human_eval-0.75": 5.428639967267106,
            "human_eval": 5.412155238593497,
            "processed_human_eval_cpp": 5.66183182319499,
            "processed_human_eval_js": 5.703259479708787
        },
        "icl": {
            "mmlu_zeroshot": 0.231260810243456,
            "mmlu": 0.24001781289514743,
            "hellaswag_zeroshot": 0.2576180100440979,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0657447949051857,
            "arc_easy": 0.30513468384742737,
            "arc_challenge": 0.21672354638576508,
            "bigbench_misconceptions": 0.49771690368652344,
            "copa": 0.5199999809265137,
            "siqa": 0.4994882345199585,
            "commonsense_qa": 0.2702702581882477,
            "piqa": 0.573993444442749,
            "openbook_qa": 0.24799999594688416,
            "bigbench_novel_concepts": 0.40625,
            "bigbench_strange_stories": 0.4137931168079376,
            "bigbench_strategy_qa": 0.479248583316803,
            "lambada_openai": 0.08286435157060623,
            "hellaswag": 0.2597092092037201,
            "winograd": 0.5274725556373596,
            "winogrande": 0.4877663850784302,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2567000091075897,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.27384695410728455,
            "bigbench_dyck_languages": 0.007000000216066837,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.14696969091892242,
            "bigbench_logical_deduction": 0.2473333328962326,
            "bigbench_operators": 0.0714285746216774,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.0010000000474974513,
            "math_qa": 0.2467314749956131,
            "logi_qa": 0.23195084929466248,
            "pubmed_qa_labeled": 0.27000001072883606,
            "squad": 0.0009460737928748131,
            "agi_eval_lsat_rc": 0.24253731966018677,
            "agi_eval_lsat_lr": 0.2803921699523926,
            "coqa": 0.027433296665549278,
            "bigbench_understanding_fables": 0.27513226866722107,
            "boolq": 0.3798165023326874,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.5666666626930237,
            "winogender_mc_male": 0.5666666626930237,
            "enterprise_pii_classification": 0.5066273808479309,
            "bbq": 0.4368714229627089
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.08571755141019821,
        "language understanding": 0.023431215124825638,
        "reading comprehension": 0.023756988307771586,
        "safety": 0.0384160645983436,
        "symbolic problem solving": 0.030456499633793203,
        "world knowledge": 0.006545134179632124
    },
    "aggregated_centered_results": 0.03384546794273907,
    "aggregated_results": 0.24377673238514042,
    "rw_small": 0.3611765181024869,
    "95%_CI_above": 0.24291599829201446,
    "99%_CI_above": 0.2180642282217741,
    "model_uuid": "34fbe81e-1f06-493a-83b5-0c096e5ef91e",
    "low_variance_datasets": 0.22416719650341707
}