{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "4eb41a68-89fc-48a1-927a-8adfa3ba8be4",
    "model": "d=576_l=24_h=8",
    "creation_date": "2024_01_30-00_00_33",
    "eval_metrics": {
        "perplexity": 3.7250176548957823,
        "downstream_perpexity": {
            "mmlu": 5.096542596817017,
            "hellaswag": 3.9192254066467287,
            "jeopardy_all": 5.487928255399068,
            "triviaqa_sm_sub": 5.707080686092377,
            "gsm8k": 3.9690631767114004,
            "agi_eval_sat_math": 2.7437205878171054,
            "aqua": 4.04759590285165,
            "svamp": 4.389272530873616,
            "bigbench_qa_wikidata": 7.934249770641327,
            "arc_easy": 4.97201331059138,
            "arc_challenge": 4.648598162333171,
            "bigbench_misconceptions": 5.377810147254979,
            "copa": 4.182152910232544,
            "siqa": 4.986958769957225,
            "commonsense_qa": 5.352666076024374,
            "piqa": 4.404519607623418,
            "openbook_qa": 5.99901135635376,
            "bigbench_novel_concepts": 4.609287202358246,
            "bigbench_strange_stories": 5.160260860947357,
            "bigbench_strategy_qa": 3.864188673098882,
            "lambada_openai": 4.614100847641627,
            "winograd_wsc": 3.777025451153626,
            "winogrande": 4.1588597615559895,
            "bigbench_conlang_translation": 3.704570414089575,
            "bigbench_language_identification": 4.096773503224055,
            "bigbench_conceptual_combinations": 1.888185806644773,
            "bigbench_elementary_math_qa": 5.5898089428742725,
            "bigbench_dyck_languages": 6.888049184775152,
            "agi_eval_lsat_ar": 5.534457142456718,
            "bigbench_cs_algorithms": 7.25568954149882,
            "bigbench_logical_deduction": 1.1855091601610184,
            "bigbench_operators": 6.246647591817947,
            "bigbench_repeat_copy_logic": 3.340232588350773,
            "simple_arithmetic_nospaces": 7.685661849855375,
            "simple_arithmetic_withspaces": 7.0334729346908444,
            "math_qa": 3.8727897981802624,
            "logi_qa": 5.4080915103127145,
            "pubmed_qa_labeled": 7.91973128188558,
            "squad": 4.198209834098816,
            "agi_eval_lsat_rc": 5.290396290038949,
            "agi_eval_lsat_lr": 5.37778899529401,
            "coqa": 4.815384527047475,
            "bigbench_understanding_fables": 4.2291970606203435,
            "boolq": 6.565642019112905,
            "agi_eval_sat_en": 5.599248242609709,
            "winogender_mc_female": 4.960550610224406,
            "winogender_mc_male": 4.24883820215861,
            "enterprise_pii_classification": 5.907419391473135,
            "bbq": 1.4308846553166708,
            "human_eval_return_complex": 2.5759990684629424,
            "human_eval_return_simple": 4.540618432534708,
            "human_eval-0.5": 2.4874856232143028,
            "human_eval-0.25": 2.509707117226066,
            "human_eval-0.75": 2.5763125448692135,
            "human_eval": 2.6026630808667437,
            "processed_human_eval_cpp": 2.2081387072616483,
            "processed_human_eval_js": 1.8954215769360705
        },
        "icl": {
            "mmlu_zeroshot": 0.2318177361760223,
            "mmlu": 0.23763826633231683,
            "hellaswag_zeroshot": 0.25652259588241577,
            "jeopardy": 0.0005361930467188359,
            "triviaqa_sm_sub": 0.0006666666595265269,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.05777274817228317,
            "arc_easy": 0.2933501601219177,
            "arc_challenge": 0.22525596618652344,
            "bigbench_misconceptions": 0.5114155411720276,
            "copa": 0.46000000834465027,
            "siqa": 0.498976469039917,
            "commonsense_qa": 0.2719082832336426,
            "piqa": 0.5582154393196106,
            "openbook_qa": 0.23999999463558197,
            "bigbench_novel_concepts": 0.375,
            "bigbench_strange_stories": 0.4482758641242981,
            "bigbench_strategy_qa": 0.48012232780456543,
            "lambada_openai": 0.1061517596244812,
            "hellaswag": 0.25522804260253906,
            "winograd": 0.5347985625267029,
            "winogrande": 0.4838200509548187,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.2524999976158142,
            "bigbench_conceptual_combinations": 0.2330097109079361,
            "bigbench_elementary_math_qa": 0.2687107026576996,
            "bigbench_dyck_languages": 0.010999999940395355,
            "agi_eval_lsat_ar": 0.20434781908988953,
            "bigbench_cs_algorithms": 0.3954545557498932,
            "bigbench_logical_deduction": 0.24133333563804626,
            "bigbench_operators": 0.06666667014360428,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0010000000474974513,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.243043914437294,
            "logi_qa": 0.21658986806869507,
            "pubmed_qa_labeled": 0.27000001072883606,
            "squad": 0.0005676442524418235,
            "agi_eval_lsat_rc": 0.24626865983009338,
            "agi_eval_lsat_lr": 0.2921568751335144,
            "coqa": 0.012276086956262589,
            "bigbench_understanding_fables": 0.31216931343078613,
            "boolq": 0.38134557008743286,
            "agi_eval_sat_en": 0.291262149810791,
            "winogender_mc_female": 0.5666666626930237,
            "winogender_mc_male": 0.46666666865348816,
            "enterprise_pii_classification": 0.5063328146934509,
            "bbq": 0.4471492821520025
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.07671557242671648,
        "language understanding": 0.017467014491558075,
        "reading comprehension": 0.029334693298248265,
        "safety": -0.006592285904017364,
        "symbolic problem solving": 0.04303899514230174,
        "world knowledge": 0.00823619108061968
    },
    "aggregated_centered_results": 0.03280869713520778,
    "aggregated_results": 0.2443438518256284,
    "rw_small": 0.35004905859629315,
    "95%_CI_above": 0.24478073884090973,
    "99%_CI_above": 0.2236620390517637,
    "model_uuid": "1b90fa26-88ee-403e-996c-0f5f09d912e4",
    "low_variance_datasets": 0.23035082493125544
}