{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "4b4b487c-d0bd-4d82-bffb-9b2eebd138bf",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_01_29-20_28_06",
    "eval_metrics": {
        "perplexity": 4.312821765740712,
        "downstream_perpexity": {
            "mmlu": 4.942943485577901,
            "hellaswag": 4.441753393411636,
            "jeopardy_all": 6.309240094820658,
            "triviaqa_sm_sub": 6.3645256082216894,
            "gsm8k": 4.411851906776429,
            "agi_eval_sat_math": 3.196599215810949,
            "aqua": 4.511555939304586,
            "svamp": 4.657570473353068,
            "bigbench_qa_wikidata": 8.473536086082458,
            "arc_easy": 5.544767795006434,
            "arc_challenge": 5.1490527272224424,
            "bigbench_misconceptions": 6.515240225073409,
            "copa": 4.6383309650421145,
            "siqa": 5.249826145172119,
            "commonsense_qa": 5.453530295689901,
            "piqa": 5.008208056290944,
            "openbook_qa": 6.452971439361573,
            "bigbench_novel_concepts": 5.152050510048866,
            "bigbench_strange_stories": 5.676178543046973,
            "bigbench_strategy_qa": 4.841753506660462,
            "lambada_openai": 5.528528116146723,
            "winograd_wsc": 4.110338988321605,
            "winogrande": 4.526752243439357,
            "bigbench_conlang_translation": 4.214493734080617,
            "bigbench_language_identification": 4.339650742212931,
            "bigbench_conceptual_combinations": 2.6511336148363873,
            "bigbench_elementary_math_qa": 5.199068037668864,
            "bigbench_dyck_languages": 6.32502118429216,
            "agi_eval_lsat_ar": 5.05042375067006,
            "bigbench_cs_algorithms": 8.254282108942668,
            "bigbench_logical_deduction": 1.5116089483102162,
            "bigbench_operators": 6.444374207087925,
            "bigbench_repeat_copy_logic": 3.8440572023391724,
            "simple_arithmetic_nospaces": 7.243881834655249,
            "simple_arithmetic_withspaces": 7.5880215358333425,
            "math_qa": 4.4002168814341225,
            "logi_qa": 5.12381931860143,
            "pubmed_qa_labeled": 9.931834002502827,
            "squad": 4.612104034423828,
            "agi_eval_lsat_rc": 5.2909155016514795,
            "agi_eval_lsat_lr": 5.308278822431378,
            "coqa": 5.310937517881394,
            "bigbench_understanding_fables": 4.392991378824547,
            "boolq": 7.5029977560043335,
            "agi_eval_sat_en": 5.323927809891192,
            "winogender_mc_female": 5.987926880518596,
            "winogender_mc_male": 4.917484060923258,
            "enterprise_pii_classification": 7.709266889095306,
            "bbq": 2.2605604539314905,
            "human_eval_return_complex": 3.350829413556677,
            "human_eval_return_simple": 5.601498126983643,
            "human_eval-0.5": 3.2159369151766706,
            "human_eval-0.25": 3.251542534770035,
            "human_eval-0.75": 3.3181205566336467,
            "human_eval": 3.34687738011523,
            "processed_human_eval_cpp": 2.934647087725053,
            "processed_human_eval_js": 2.462752275350617
        },
        "icl": {
            "mmlu_zeroshot": 0.23129551112651825,
            "mmlu": 0.23527908312124118,
            "hellaswag_zeroshot": 0.25204142928123474,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.0010000000474974513,
            "gsm8k": 0.0007581501267850399,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.027803750708699226,
            "arc_easy": 0.28703704476356506,
            "arc_challenge": 0.22013652324676514,
            "bigbench_misconceptions": 0.4840182662010193,
            "copa": 0.5,
            "siqa": 0.4938587546348572,
            "commonsense_qa": 0.27436527609825134,
            "piqa": 0.5473340749740601,
            "openbook_qa": 0.23800000548362732,
            "bigbench_novel_concepts": 0.3125,
            "bigbench_strange_stories": 0.44252872467041016,
            "bigbench_strategy_qa": 0.4897335171699524,
            "lambada_openai": 0.05317290872335434,
            "hellaswag": 0.25014936923980713,
            "winograd": 0.5054945349693298,
            "winogrande": 0.5011839270591736,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.25060001015663147,
            "bigbench_conceptual_combinations": 0.3300970792770386,
            "bigbench_elementary_math_qa": 0.27036163210868835,
            "bigbench_dyck_languages": 0.019999999552965164,
            "agi_eval_lsat_ar": 0.19565217196941376,
            "bigbench_cs_algorithms": 0.39924243092536926,
            "bigbench_logical_deduction": 0.2553333342075348,
            "bigbench_operators": 0.08571428805589676,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004999999888241291,
            "simple_arithmetic_withspaces": 0.006000000052154064,
            "math_qa": 0.24237345159053802,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.003000000026077032,
            "squad": 0.0009460737928748131,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27450981736183167,
            "coqa": 0.0016284604789689183,
            "bigbench_understanding_fables": 0.2857142984867096,
            "boolq": 0.4388379156589508,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.44999998807907104,
            "winogender_mc_male": 0.6499999761581421,
            "enterprise_pii_classification": 0.5119293332099915,
            "bbq": 0.44372573494911194
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.06834131479263306,
        "language understanding": 0.02213087708999713,
        "reading comprehension": 0.008292030538238274,
        "safety": 0.027827516198158264,
        "symbolic problem solving": 0.04520157886048158,
        "world knowledge": -0.004770270895538987
    },
    "aggregated_centered_results": 0.030221281566715538,
    "aggregated_results": 0.24025014839700568,
    "rw_small": 0.36653947333494824,
    "95%_CI_above": 0.23644163242618862,
    "99%_CI_above": 0.21257710673380642,
    "model_uuid": "1084ea36-5383-4043-8c42-00f5a786b48e",
    "low_variance_datasets": 0.2295154634154063
}