{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "0f8276ce-355e-42ec-acc9-d5a32de1714a",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-14_54_56",
    "eval_metrics": {
        "perplexity": 5.929844685395559,
        "downstream_perpexity": {
            "mmlu": 5.0160352230072025,
            "hellaswag": 5.527158868312836,
            "jeopardy_all": 7.756590839227041,
            "triviaqa_sm_sub": 7.349801214536031,
            "gsm8k": 6.527832706769307,
            "agi_eval_sat_math": 4.877539925141768,
            "aqua": 5.875670228685651,
            "svamp": 5.429476613998413,
            "bigbench_qa_wikidata": 9.867216205596923,
            "arc_easy": 6.96289428472519,
            "arc_challenge": 6.479176227251688,
            "bigbench_misconceptions": 6.042052713159013,
            "copa": 5.669822068214416,
            "siqa": 4.801163224379222,
            "commonsense_qa": 5.1276620348294575,
            "piqa": 6.162451914946238,
            "openbook_qa": 7.524622845649719,
            "bigbench_novel_concepts": 5.819500625133514,
            "bigbench_strange_stories": 6.655486249375618,
            "bigbench_strategy_qa": 5.174382515748342,
            "lambada_openai": 8.772242716948192,
            "winograd_wsc": 4.93311990748395,
            "winogrande": 5.389448205629985,
            "bigbench_conlang_translation": 7.29032943597654,
            "bigbench_language_identification": 4.930123674869537,
            "bigbench_conceptual_combinations": 6.651164786329547,
            "bigbench_elementary_math_qa": 6.5774698734283445,
            "bigbench_dyck_languages": 7.035127000648434,
            "agi_eval_lsat_ar": 4.995378902684087,
            "bigbench_cs_algorithms": 8.837957902749379,
            "bigbench_logical_deduction": 4.880650595823924,
            "bigbench_operators": 6.488946251642137,
            "bigbench_repeat_copy_logic": 7.346457213163376,
            "simple_arithmetic_nospaces": 7.231689826781008,
            "simple_arithmetic_withspaces": 7.411607835974012,
            "math_qa": 5.524254024028778,
            "logi_qa": 5.011813061944167,
            "pubmed_qa_labeled": 9.063966597829547,
            "squad": 7.437412317593893,
            "agi_eval_lsat_rc": 5.064749724829375,
            "agi_eval_lsat_lr": 5.102526224360746,
            "coqa": 7.313922659556071,
            "bigbench_understanding_fables": 5.5665242331368585,
            "boolq": 8.997177402178446,
            "agi_eval_sat_en": 5.048774703035077,
            "winogender_mc_female": 8.566533327102661,
            "winogender_mc_male": 8.48206181526184,
            "enterprise_pii_classification": 8.936921898523966,
            "bbq": 7.403559509913126,
            "human_eval_return_complex": 4.92390398340901,
            "human_eval_return_simple": 5.787064874494398,
            "human_eval-0.5": 4.948273699458053,
            "human_eval-0.25": 4.930699519994782,
            "human_eval-0.75": 4.966218903297332,
            "human_eval": 4.927900840596455,
            "processed_human_eval_cpp": 4.737003761789073,
            "processed_human_eval_js": 3.942431638880474
        },
        "icl": {
            "mmlu_zeroshot": 0.23116857623844816,
            "mmlu": 0.23116857623844816,
            "hellaswag_zeroshot": 0.2469627559185028,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.0006666666595265269,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.013636363670229912,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.2571548819541931,
            "arc_challenge": 0.21672354638576508,
            "bigbench_misconceptions": 0.4794520437717438,
            "copa": 0.5099999904632568,
            "siqa": 0.48413512110710144,
            "commonsense_qa": 0.1957411915063858,
            "piqa": 0.5163220763206482,
            "openbook_qa": 0.25,
            "bigbench_novel_concepts": 0.21875,
            "bigbench_strange_stories": 0.522988498210907,
            "bigbench_strategy_qa": 0.47007426619529724,
            "lambada_openai": 0.0,
            "hellaswag": 0.24546903371810913,
            "winograd": 0.5018315315246582,
            "winogrande": 0.5201262831687927,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2502000033855438,
            "bigbench_conceptual_combinations": 0.26213592290878296,
            "bigbench_elementary_math_qa": 0.26580187678337097,
            "bigbench_dyck_languages": 0.0,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.0,
            "bigbench_logical_deduction": 0.25866666436195374,
            "bigbench_operators": 0.08095238357782364,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.007000000216066837,
            "math_qa": 0.24840764701366425,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.3782874643802643,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.5,
            "winogender_mc_male": 0.5,
            "enterprise_pii_classification": 0.5063328146934509,
            "bbq": 0.45456124164841394
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.047878521184126534,
        "language understanding": 0.006284072995185852,
        "reading comprehension": -0.011297978460788725,
        "safety": -0.019552971829067567,
        "symbolic problem solving": 0.019679816998541357,
        "world knowledge": -0.015684392255147485
    },
    "aggregated_centered_results": 0.00851817951703646,
    "aggregated_results": 0.2229088291563419,
    "rw_small": 0.3548758203784625,
    "95%_CI_above": 0.20867591055505205,
    "99%_CI_above": 0.18607963651418685,
    "model_uuid": "ad840351-877c-457c-be08-df34086d14af",
    "low_variance_datasets": 0.1998117318884893
}