{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "5921c266-54c5-4c53-9cc5-667cd46fba9e",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-16_21_51",
    "eval_metrics": {
        "perplexity": 6.319463467597961,
        "downstream_perpexity": {
            "mmlu": 5.503090047836304,
            "hellaswag": 5.9508083939552305,
            "jeopardy_all": 8.278217883904775,
            "triviaqa_sm_sub": 8.080197127660115,
            "gsm8k": 6.88970852692922,
            "agi_eval_sat_math": 5.348069360039451,
            "aqua": 6.280104391915458,
            "svamp": 5.667468082110087,
            "bigbench_qa_wikidata": 10.553320185343425,
            "arc_easy": 7.271507592995961,
            "arc_challenge": 6.819167002042135,
            "bigbench_misconceptions": 6.053521554764003,
            "copa": 5.97659282207489,
            "siqa": 5.451533861955007,
            "commonsense_qa": 5.607543746630351,
            "piqa": 6.707878847916921,
            "openbook_qa": 7.958116717338562,
            "bigbench_novel_concepts": 6.027446240186691,
            "bigbench_strange_stories": 6.843205926062047,
            "bigbench_strategy_qa": 5.661275986830393,
            "lambada_openai": 9.442887361844381,
            "winograd_wsc": 5.180010510014964,
            "winogrande": 5.67545124689738,
            "bigbench_conlang_translation": 7.5184475270713245,
            "bigbench_language_identification": 5.157641196250916,
            "bigbench_conceptual_combinations": 7.190637236659967,
            "bigbench_elementary_math_qa": 7.143185579776764,
            "bigbench_dyck_languages": 7.450511519147568,
            "agi_eval_lsat_ar": 5.332549893337747,
            "bigbench_cs_algorithms": 9.729580855369568,
            "bigbench_logical_deduction": 5.329035075505574,
            "bigbench_operators": 6.886187873567853,
            "bigbench_repeat_copy_logic": 7.8777428567409515,
            "simple_arithmetic_nospaces": 8.260592017854963,
            "simple_arithmetic_withspaces": 7.825905790849894,
            "math_qa": 5.26805739402771,
            "logi_qa": 5.532530628591089,
            "pubmed_qa_labeled": 9.539531460329265,
            "squad": 7.8393689632415775,
            "agi_eval_lsat_rc": 5.494740523509125,
            "agi_eval_lsat_lr": 5.617648637061025,
            "coqa": 7.546986166636149,
            "bigbench_understanding_fables": 5.348322580731105,
            "boolq": 8.846749885876973,
            "agi_eval_sat_en": 5.478274866215234,
            "winogender_mc_female": 9.43429455757141,
            "winogender_mc_male": 9.211011616388957,
            "enterprise_pii_classification": 8.89441324075063,
            "bbq": 7.61734398206075,
            "human_eval_return_complex": 5.235021189441831,
            "human_eval_return_simple": 5.926216937400199,
            "human_eval-0.5": 5.298889020594155,
            "human_eval-0.25": 5.286942766933906,
            "human_eval-0.75": 5.271654733797399,
            "human_eval": 5.284291476738162,
            "processed_human_eval_cpp": 5.22431276747899,
            "processed_human_eval_js": 4.646892783118457
        },
        "icl": {
            "mmlu_zeroshot": 0.23116857623844816,
            "mmlu": 0.23116857623844816,
            "hellaswag_zeroshot": 0.2457677721977234,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.25757575035095215,
            "arc_challenge": 0.22098976373672485,
            "bigbench_misconceptions": 0.5159817337989807,
            "copa": 0.5199999809265137,
            "siqa": 0.48413512110710144,
            "commonsense_qa": 0.1957411915063858,
            "piqa": 0.49945592880249023,
            "openbook_qa": 0.23999999463558197,
            "bigbench_novel_concepts": 0.15625,
            "bigbench_strange_stories": 0.5632184147834778,
            "bigbench_strategy_qa": 0.527304470539093,
            "lambada_openai": 0.0,
            "hellaswag": 0.24666401743888855,
            "winograd": 0.5018315315246582,
            "winogrande": 0.5169692039489746,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2502000033855438,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.2684224247932434,
            "bigbench_dyck_languages": 0.0,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.0,
            "bigbench_logical_deduction": 0.24933333694934845,
            "bigbench_operators": 0.08095238357782364,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.007000000216066837,
            "math_qa": 0.24840764701366425,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.3782874643802643,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.6000000238418579,
            "winogender_mc_male": 0.4166666567325592,
            "enterprise_pii_classification": 0.5063328146934509,
            "bbq": 0.4793332544240085
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.057193679114182785,
        "language understanding": 0.008731047312418621,
        "reading comprehension": -0.011297978460788725,
        "safety": 0.0011663748459382617,
        "symbolic problem solving": 0.01847706487816241,
        "world knowledge": -0.005812455456605525
    },
    "aggregated_centered_results": 0.013183055149604833,
    "aggregated_results": 0.22488672702171675,
    "rw_small": 0.35659832259019214,
    "95%_CI_above": 0.20946202808708855,
    "99%_CI_above": 0.18546619027853012,
    "model_uuid": "3070bbe6-259e-4569-bd06-23e482f79e6c",
    "low_variance_datasets": 0.19911463389342482
}