{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "13495fa5-ddbb-41c4-9965-02c342f1a807",
    "model": "d=1024_l=24_h=8",
    "creation_date": "2024_02_12-13_50_09",
    "eval_metrics": {
        "perplexity": 2.6416500906149545,
        "downstream_perpexity": {
            "mmlu": 4.893110092480978,
            "hellaswag": 2.918187596400579,
            "jeopardy_all": 3.1992373198270796,
            "triviaqa_sm_sub": 3.6257577409346897,
            "gsm8k": 2.5252323408921558,
            "agi_eval_sat_math": 1.9453863826665012,
            "aqua": 2.988311683888338,
            "svamp": 3.0600780216852823,
            "bigbench_qa_wikidata": 5.471802874406179,
            "arc_easy": 3.500327077507973,
            "arc_challenge": 3.4280635118484497,
            "bigbench_misconceptions": 6.0463988443487855,
            "copa": 3.160622251033783,
            "siqa": 4.5849490861097975,
            "commonsense_qa": 5.080174497763315,
            "piqa": 3.223667937517166,
            "openbook_qa": 4.938306712150574,
            "bigbench_novel_concepts": 3.6657385379076004,
            "bigbench_strange_stories": 4.08556927209613,
            "bigbench_strategy_qa": 2.407182147105535,
            "lambada_openai": 2.312027034163475,
            "winograd_wsc": 3.00337309889741,
            "winogrande": 3.4762423833211265,
            "bigbench_conlang_translation": 2.5836079178786857,
            "bigbench_language_identification": 4.239091420173645,
            "bigbench_conceptual_combinations": 0.7603130716721989,
            "bigbench_elementary_math_qa": 3.997741335630417,
            "bigbench_dyck_languages": 4.239331330321416,
            "agi_eval_lsat_ar": 5.275849161977353,
            "bigbench_cs_algorithms": 7.5555061062177025,
            "bigbench_logical_deduction": 0.9823949654897054,
            "bigbench_operators": 6.1154031004224505,
            "bigbench_repeat_copy_logic": 2.070330873131752,
            "simple_arithmetic_nospaces": 7.524639223303113,
            "simple_arithmetic_withspaces": 6.486660767002266,
            "math_qa": 4.847928883632024,
            "logi_qa": 4.677567596625988,
            "pubmed_qa_labeled": 4.2057037518805815,
            "squad": 2.437259726723035,
            "agi_eval_lsat_rc": 4.891376952626812,
            "agi_eval_lsat_lr": 5.076623721216239,
            "coqa": 2.4715082118908565,
            "bigbench_understanding_fables": 5.071790410097314,
            "boolq": 3.2859223425388335,
            "agi_eval_sat_en": 4.884535784860259,
            "winogender_mc_female": 1.964106371005376,
            "winogender_mc_male": 2.065447837114334,
            "enterprise_pii_classification": 3.51590429743131,
            "bbq": 0.5049367276330788,
            "human_eval_return_complex": 1.4336133650907381,
            "human_eval_return_simple": 3.994417383864119,
            "human_eval-0.5": 1.2887093882735183,
            "human_eval-0.25": 1.3214689769395969,
            "human_eval-0.75": 1.3285452469093044,
            "human_eval": 1.3882434026497166,
            "processed_human_eval_cpp": 1.1788723698313932,
            "processed_human_eval_js": 1.0840504114947669
        },
        "icl": {
            "mmlu_zeroshot": 0.2323835024185348,
            "mmlu": 0.24074983583730564,
            "hellaswag_zeroshot": 0.3175662159919739,
            "jeopardy": 0.012816845159977674,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.004081632476300001,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.41090497374534607,
            "arc_easy": 0.4377104341983795,
            "arc_challenge": 0.23293516039848328,
            "bigbench_misconceptions": 0.5068492889404297,
            "copa": 0.6100000143051147,
            "siqa": 0.4969293773174286,
            "commonsense_qa": 0.25470924377441406,
            "piqa": 0.6180630922317505,
            "openbook_qa": 0.28200000524520874,
            "bigbench_novel_concepts": 0.34375,
            "bigbench_strange_stories": 0.48275861144065857,
            "bigbench_strategy_qa": 0.5107033848762512,
            "lambada_openai": 0.3578498065471649,
            "hellaswag": 0.3168691396713257,
            "winograd": 0.5897436141967773,
            "winogrande": 0.5003946423530579,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.25270000100135803,
            "bigbench_conceptual_combinations": 0.291262149810791,
            "bigbench_elementary_math_qa": 0.25513628125190735,
            "bigbench_dyck_languages": 0.19099999964237213,
            "agi_eval_lsat_ar": 0.19130434095859528,
            "bigbench_cs_algorithms": 0.4363636374473572,
            "bigbench_logical_deduction": 0.24799999594688416,
            "bigbench_operators": 0.12857143580913544,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.004000000189989805,
            "math_qa": 0.25209519267082214,
            "logi_qa": 0.2626728117465973,
            "pubmed_qa_labeled": 0.5040000081062317,
            "squad": 0.036991484463214874,
            "agi_eval_lsat_rc": 0.23134328424930573,
            "agi_eval_lsat_lr": 0.28823530673980713,
            "coqa": 0.11900287866592407,
            "bigbench_understanding_fables": 0.28042328357696533,
            "boolq": 0.5055046081542969,
            "agi_eval_sat_en": 0.223300963640213,
            "winogender_mc_female": 0.46666666865348816,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.5278350710868835,
            "bbq": 0.501987874507904
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.1251068264245987,
        "language understanding": 0.09852309788887699,
        "reading comprehension": 0.08775925481071074,
        "safety": -0.010088533163070679,
        "symbolic problem solving": 0.06578259176264206,
        "world knowledge": 0.07868237173994383
    },
    "aggregated_centered_results": 0.07974425785687042,
    "aggregated_results": 0.2835602613357998,
    "rw_small": 0.43401851256688434,
    "95%_CI_above": 0.31641709576981764,
    "99%_CI_above": 0.3032854660861194,
    "model_uuid": "226d1bca-1c90-4168-8481-13ecb6fbdea1",
    "low_variance_datasets": 0.30922734427096493
}