{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "ac492edb-f500-4cdf-879d-92b0cd9102bc",
    "model": "d=512_l=8_h=4",
    "creation_date": "2024_02_09-21_05_49",
    "eval_metrics": {
        "perplexity": 4.565670736630758,
        "downstream_perpexity": {
            "mmlu": 4.659276648362478,
            "hellaswag": 3.4891374667485553,
            "jeopardy_all": 5.449879437685013,
            "triviaqa_sm_sub": 5.643108389774958,
            "gsm8k": 3.9677710076173147,
            "agi_eval_sat_math": 4.320747128399936,
            "aqua": 4.225015214024758,
            "svamp": 4.876692229906718,
            "bigbench_qa_wikidata": 7.502311372756958,
            "arc_easy": 4.700574153661728,
            "arc_challenge": 4.4825659970442455,
            "bigbench_misconceptions": 6.099899485775325,
            "copa": 3.873877670764923,
            "siqa": 5.023741116126378,
            "commonsense_qa": 4.831084940830866,
            "piqa": 3.856841601928075,
            "openbook_qa": 5.693548871994018,
            "bigbench_novel_concepts": 4.4884859919548035,
            "bigbench_strange_stories": 4.998767066275937,
            "bigbench_strategy_qa": 3.3568828761577607,
            "lambada_openai": 4.333368588487307,
            "winograd_wsc": 3.6090977851287787,
            "winogrande": 3.928544694185257,
            "bigbench_conlang_translation": 3.4635089345094636,
            "bigbench_language_identification": 4.187685666481654,
            "bigbench_conceptual_combinations": 1.5392177996126193,
            "bigbench_elementary_math_qa": 5.397983219226202,
            "bigbench_dyck_languages": 7.1215054262586,
            "agi_eval_lsat_ar": 4.967889454053796,
            "bigbench_cs_algorithms": 7.1195927381515505,
            "bigbench_logical_deduction": 1.0336716478069623,
            "bigbench_operators": 6.073266374497186,
            "bigbench_repeat_copy_logic": 3.0812078788876534,
            "simple_arithmetic_nospaces": 7.497496604418554,
            "simple_arithmetic_withspaces": 6.845395088195801,
            "math_qa": 4.25617151260376,
            "logi_qa": 4.855441869861703,
            "pubmed_qa_labeled": 9.476987874808431,
            "squad": 3.89427335858345,
            "agi_eval_lsat_rc": 5.436761039406506,
            "agi_eval_lsat_lr": 5.141873410168816,
            "coqa": 4.331069934368133,
            "bigbench_understanding_fables": 5.016830996861533,
            "boolq": 5.78572948773702,
            "agi_eval_sat_en": 5.618899530577428,
            "winogender_mc_female": 4.630325321356455,
            "winogender_mc_male": 3.84143906434377,
            "enterprise_pii_classification": 4.909988975524902,
            "bbq": 0.9175383838514487,
            "human_eval_return_complex": 4.988392893723615,
            "human_eval_return_simple": 7.550731697598019,
            "human_eval-0.5": 4.903841576925138,
            "human_eval-0.25": 5.0028796573964565,
            "human_eval-0.75": 5.065644357262588,
            "human_eval": 5.065457832522508,
            "processed_human_eval_cpp": 5.377663997389515,
            "processed_human_eval_js": 5.3387881110354165
        },
        "icl": {
            "mmlu_zeroshot": 0.23134924523663103,
            "mmlu": 0.23680409266237626,
            "hellaswag_zeroshot": 0.27136027812957764,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.05634565278887749,
            "arc_easy": 0.31607744097709656,
            "arc_challenge": 0.22098976373672485,
            "bigbench_misconceptions": 0.4885844886302948,
            "copa": 0.4699999988079071,
            "siqa": 0.5,
            "commonsense_qa": 0.28828829526901245,
            "piqa": 0.592491865158081,
            "openbook_qa": 0.25999999046325684,
            "bigbench_novel_concepts": 0.28125,
            "bigbench_strange_stories": 0.43103447556495667,
            "bigbench_strategy_qa": 0.4954128563404083,
            "lambada_openai": 0.1465165913105011,
            "hellaswag": 0.2655845582485199,
            "winograd": 0.5347985625267029,
            "winogrande": 0.507498025894165,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2493000030517578,
            "bigbench_conceptual_combinations": 0.2330097109079361,
            "bigbench_elementary_math_qa": 0.2643868029117584,
            "bigbench_dyck_languages": 0.004999999888241291,
            "agi_eval_lsat_ar": 0.20434781908988953,
            "bigbench_cs_algorithms": 0.35151514410972595,
            "bigbench_logical_deduction": 0.23333333432674408,
            "bigbench_operators": 0.0714285746216774,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004000000189989805,
            "simple_arithmetic_withspaces": 0.0010000000474974513,
            "math_qa": 0.23801541328430176,
            "logi_qa": 0.20122887194156647,
            "pubmed_qa_labeled": 0.23499999940395355,
            "squad": 0.0012298959773033857,
            "agi_eval_lsat_rc": 0.2798507511615753,
            "agi_eval_lsat_lr": 0.272549033164978,
            "coqa": 0.01002129539847374,
            "bigbench_understanding_fables": 0.2857142984867096,
            "boolq": 0.522629976272583,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4000000059604645,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.48571428656578064,
            "bbq": 0.4416305124759674
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.0767411800722281,
        "language understanding": 0.03209781274199486,
        "reading comprehension": 0.05961972041404806,
        "safety": -0.06132759153842926,
        "symbolic problem solving": 0.037185970375624794,
        "world knowledge": 0.005101085857875243
    },
    "aggregated_centered_results": 0.033352109454310296,
    "aggregated_results": 0.24394395280026537,
    "rw_small": 0.38475924730300903,
    "95%_CI_above": 0.24897899887679767,
    "99%_CI_above": 0.23267352058552204,
    "model_uuid": "b426fc01-4b92-47ee-9596-ab5226f2e046",
    "low_variance_datasets": 0.24297380598727614
}