{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "a66dd6f5-b88e-4fb6-8468-e0c0ba6fa87a",
    "model": "d=512_l=8_h=4",
    "creation_date": "2024_02_12-09_34_17",
    "eval_metrics": {
        "perplexity": 3.921025884151459,
        "downstream_perpexity": {
            "mmlu": 4.862290426095327,
            "hellaswag": 3.376871846119563,
            "jeopardy_all": 5.040708843866984,
            "triviaqa_sm_sub": 5.177205181121826,
            "gsm8k": 3.4634249448776244,
            "agi_eval_sat_math": 2.8817618771032856,
            "aqua": 3.6910606608098866,
            "svamp": 4.364497334162394,
            "bigbench_qa_wikidata": 6.940819537639618,
            "arc_easy": 4.464545937379201,
            "arc_challenge": 4.223764284451803,
            "bigbench_misconceptions": 6.167237732508411,
            "copa": 3.774786822795868,
            "siqa": 5.252563281853994,
            "commonsense_qa": 5.387643380959829,
            "piqa": 3.7862015863259635,
            "openbook_qa": 5.525064382553101,
            "bigbench_novel_concepts": 4.225339964032173,
            "bigbench_strange_stories": 3.983130703027221,
            "bigbench_strategy_qa": 3.1555442214012146,
            "lambada_openai": 3.6053626616795857,
            "winograd_wsc": 3.418608281201932,
            "winogrande": 3.818396580219269,
            "bigbench_conlang_translation": 3.157907900286884,
            "bigbench_language_identification": 5.666616268952688,
            "bigbench_conceptual_combinations": 1.4610870787240926,
            "bigbench_elementary_math_qa": 5.2390926539897915,
            "bigbench_dyck_languages": 5.6316558508312005,
            "agi_eval_lsat_ar": 5.105568780069766,
            "bigbench_cs_algorithms": 6.688132333755493,
            "bigbench_logical_deduction": 1.3315095792214076,
            "bigbench_operators": 6.175288677215576,
            "bigbench_repeat_copy_logic": 2.770679421722889,
            "simple_arithmetic_nospaces": 7.330710727126658,
            "simple_arithmetic_withspaces": 7.136766970658503,
            "math_qa": 6.367481287320455,
            "logi_qa": 4.741228766521917,
            "pubmed_qa_labeled": 6.877171615091693,
            "squad": 3.076729593674342,
            "agi_eval_lsat_rc": 5.57746986043987,
            "agi_eval_lsat_lr": 5.423788938335344,
            "coqa": 3.824927763144175,
            "bigbench_understanding_fables": 6.005099710333284,
            "boolq": 5.075347173213959,
            "agi_eval_sat_en": 5.731676011409574,
            "winogender_mc_female": 4.648295573393503,
            "winogender_mc_male": 3.652836604913076,
            "enterprise_pii_classification": 5.392138961950938,
            "bbq": 0.6912334534029166,
            "human_eval_return_complex": 5.198904683270793,
            "human_eval_return_simple": 8.519833745183172,
            "human_eval-0.5": 5.1123344781922135,
            "human_eval-0.25": 5.274180894944726,
            "human_eval-0.75": 5.16813029312506,
            "human_eval": 5.449433855894135,
            "processed_human_eval_cpp": 4.654831758937481,
            "processed_human_eval_js": 4.182844979007069
        },
        "icl": {
            "mmlu_zeroshot": 0.2311098831787444,
            "mmlu": 0.243327855540995,
            "hellaswag_zeroshot": 0.2661820352077484,
            "jeopardy": 0.0005361930467188359,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.005307050887495279,
            "agi_eval_sat_math": 0.00909090880304575,
            "aqua": 0.008163264952600002,
            "svamp": 0.0033333334140479565,
            "bigbench_qa_wikidata": 0.10368584096431732,
            "arc_easy": 0.33375421166419983,
            "arc_challenge": 0.2226962447166443,
            "bigbench_misconceptions": 0.4840182662010193,
            "copa": 0.44999998807907104,
            "siqa": 0.501023530960083,
            "commonsense_qa": 0.3013922870159149,
            "piqa": 0.5892274379730225,
            "openbook_qa": 0.24799999594688416,
            "bigbench_novel_concepts": 0.375,
            "bigbench_strange_stories": 0.4482758641242981,
            "bigbench_strategy_qa": 0.49235475063323975,
            "lambada_openai": 0.19076266884803772,
            "hellaswag": 0.268074095249176,
            "winograd": 0.5091575384140015,
            "winogrande": 0.5130228996276855,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.2556999921798706,
            "bigbench_conceptual_combinations": 0.27184465527534485,
            "bigbench_elementary_math_qa": 0.26677149534225464,
            "bigbench_dyck_languages": 0.013000000268220901,
            "agi_eval_lsat_ar": 0.20434781908988953,
            "bigbench_cs_algorithms": 0.36666667461395264,
            "bigbench_logical_deduction": 0.2293333262205124,
            "bigbench_operators": 0.10000000149011612,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004000000189989805,
            "simple_arithmetic_withspaces": 0.004999999888241291,
            "math_qa": 0.24874287843704224,
            "logi_qa": 0.25038403272628784,
            "pubmed_qa_labeled": 0.45500001311302185,
            "squad": 0.0016083254013210535,
            "agi_eval_lsat_rc": 0.24626865983009338,
            "agi_eval_lsat_lr": 0.2705882489681244,
            "coqa": 0.044344231486320496,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.6067278385162354,
            "agi_eval_sat_en": 0.2669903039932251,
            "winogender_mc_female": 0.5166666507720947,
            "winogender_mc_male": 0.4333333373069763,
            "enterprise_pii_classification": 0.4865979254245758,
            "bbq": 0.43335921656001697
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.09303577492634454,
        "language understanding": 0.0419771580491215,
        "reading comprehension": 0.09648544196776736,
        "safety": -0.06502143496816809,
        "symbolic problem solving": 0.04750342425993747,
        "world knowledge": 0.01422201998461867
    },
    "aggregated_centered_results": 0.047416250655866785,
    "aggregated_results": 0.25593445180114055,
    "rw_small": 0.3987305363019307,
    "95%_CI_above": 0.27021224352178363,
    "99%_CI_above": 0.25133698601834475,
    "model_uuid": "f859cec4-21b6-4818-a08b-3315a17437e2",
    "low_variance_datasets": 0.2540402872636067
}