{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "08ae335f-f8d8-4883-a8ed-0b5b78910910",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-16_31_30",
    "eval_metrics": {
        "perplexity": 5.282713107268015,
        "downstream_perpexity": {
            "mmlu": 5.601785751183828,
            "hellaswag": 4.438354001442591,
            "jeopardy_all": 6.843250131607055,
            "triviaqa_sm_sub": 6.567363484700521,
            "gsm8k": 4.6811162034670515,
            "agi_eval_sat_math": 4.747289345481179,
            "aqua": 4.731147363234539,
            "svamp": 4.7738189109166465,
            "bigbench_qa_wikidata": 8.992606496810913,
            "arc_easy": 5.77530303200086,
            "arc_challenge": 5.4426249146461485,
            "bigbench_misconceptions": 5.671508405850902,
            "copa": 4.879878664016724,
            "siqa": 6.126229695479075,
            "commonsense_qa": 6.2052500208218895,
            "piqa": 4.979520014921824,
            "openbook_qa": 6.596795108795166,
            "bigbench_novel_concepts": 5.215355545282364,
            "bigbench_strange_stories": 5.271440911567074,
            "bigbench_strategy_qa": 5.302442014217377,
            "lambada_openai": 6.108390372991562,
            "winograd_wsc": 4.291097839236696,
            "winogrande": 4.600296245018641,
            "bigbench_conlang_translation": 4.854598580337152,
            "bigbench_language_identification": 4.981206818421682,
            "bigbench_conceptual_combinations": 3.6183798012224218,
            "bigbench_elementary_math_qa": 5.954903427759806,
            "bigbench_dyck_languages": 5.321719527244568,
            "agi_eval_lsat_ar": 5.961342542067818,
            "bigbench_cs_algorithms": 8.609827403227488,
            "bigbench_logical_deduction": 2.668784342209498,
            "bigbench_operators": 6.566143237976801,
            "bigbench_repeat_copy_logic": 4.761763751506805,
            "simple_arithmetic_nospaces": 7.9037436422179725,
            "simple_arithmetic_withspaces": 8.340378290965777,
            "math_qa": 4.742527647813161,
            "logi_qa": 5.742873399854622,
            "pubmed_qa_labeled": 8.6778609291846,
            "squad": 5.117738916476568,
            "agi_eval_lsat_rc": 5.831013047873085,
            "agi_eval_lsat_lr": 5.937623412936341,
            "coqa": 5.688605165481567,
            "bigbench_understanding_fables": 4.849325427302608,
            "boolq": 7.1623533527056376,
            "agi_eval_sat_en": 5.886129309830157,
            "winogender_mc_female": 5.609361569086711,
            "winogender_mc_male": 5.206568137804667,
            "enterprise_pii_classification": 8.65307997862498,
            "bbq": 3.412656463185946,
            "human_eval_return_complex": 6.737424144594688,
            "human_eval_return_simple": 11.142046103606353,
            "human_eval-0.5": 6.655629361548075,
            "human_eval-0.25": 6.761487047846725,
            "human_eval-0.75": 6.818924773030165,
            "human_eval": 6.841344246050206,
            "processed_human_eval_cpp": 6.347583453847755,
            "processed_human_eval_js": 5.8369846256767834
        },
        "icl": {
            "mmlu_zeroshot": 0.230996131112701,
            "mmlu": 0.2315937704161594,
            "hellaswag_zeroshot": 0.25263890624046326,
            "jeopardy": 0.0005361930467188359,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0007581501267850399,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0833333358168602,
            "bigbench_qa_wikidata": 0.004970227833837271,
            "arc_easy": 0.2790403962135315,
            "arc_challenge": 0.19112628698349,
            "bigbench_misconceptions": 0.5068492889404297,
            "copa": 0.47999998927116394,
            "siqa": 0.48311156034469604,
            "commonsense_qa": 0.20311219990253448,
            "piqa": 0.5397170782089233,
            "openbook_qa": 0.23800000548362732,
            "bigbench_novel_concepts": 0.25,
            "bigbench_strange_stories": 0.44252872467041016,
            "bigbench_strategy_qa": 0.4936653673648834,
            "lambada_openai": 0.020958665758371353,
            "hellaswag": 0.25144392251968384,
            "winograd": 0.5384615659713745,
            "winogrande": 0.5003946423530579,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.24789999425411224,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.2704402506351471,
            "bigbench_dyck_languages": 0.012000000104308128,
            "agi_eval_lsat_ar": 0.208695650100708,
            "bigbench_cs_algorithms": 0.049242425709962845,
            "bigbench_logical_deduction": 0.2566666603088379,
            "bigbench_operators": 0.05714285746216774,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.007000000216066837,
            "simple_arithmetic_withspaces": 0.0020000000949949026,
            "math_qa": 0.2463962435722351,
            "logi_qa": 0.20737327635288239,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.00037842951132915914,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.28431373834609985,
            "coqa": 0.0002505323791410774,
            "bigbench_understanding_fables": 0.28042328357696533,
            "boolq": 0.38042813539505005,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.5,
            "winogender_mc_male": 0.5166666507720947,
            "enterprise_pii_classification": 0.495728999376297,
            "bbq": 0.41870626536282624
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.03994104887048404,
        "language understanding": 0.017923253743598856,
        "reading comprehension": -0.006176460066247577,
        "safety": -0.03444904224439102,
        "symbolic problem solving": 0.026614652123923103,
        "world knowledge": -0.008764944367622344
    },
    "aggregated_centered_results": 0.011858956829862478,
    "aggregated_results": 0.22604926020888516,
    "rw_small": 0.3472713927427928,
    "95%_CI_above": 0.2146552297497295,
    "99%_CI_above": 0.19153434383077544,
    "model_uuid": "ad053e2a-ec41-4244-8aed-ea3ed07973b6",
    "low_variance_datasets": 0.2025653683956162
}