{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "143c1184-ea9f-477c-aa05-b59440b0469b",
    "model": "d=96_l=8_h=4",
    "creation_date": "2024_02_09-21_19_37",
    "eval_metrics": {
        "perplexity": 5.078595785299937,
        "downstream_perpexity": {
            "mmlu": 5.084420720736186,
            "hellaswag": 4.939518161614736,
            "jeopardy_all": 6.946809267997741,
            "triviaqa_sm_sub": 6.746449049313863,
            "gsm8k": 5.396116264661154,
            "agi_eval_sat_math": 4.024276739900762,
            "aqua": 5.233903437244649,
            "svamp": 5.132066961924235,
            "bigbench_qa_wikidata": 9.881168969472249,
            "arc_easy": 6.005696642398834,
            "arc_challenge": 5.64430718421936,
            "bigbench_misconceptions": 5.631560754558267,
            "copa": 5.1195539426803585,
            "siqa": 4.838661468029022,
            "commonsense_qa": 5.363481060663859,
            "piqa": 5.46604110399882,
            "openbook_qa": 6.8735518054962155,
            "bigbench_novel_concepts": 5.57464987039566,
            "bigbench_strange_stories": 6.26177419739208,
            "bigbench_strategy_qa": 5.188287031650543,
            "lambada_openai": 7.384852806727092,
            "winograd_wsc": 4.527097351821787,
            "winogrande": 4.8865678131580355,
            "bigbench_conlang_translation": 5.991220488780883,
            "bigbench_language_identification": 5.002842847506205,
            "bigbench_conceptual_combinations": 5.368881267251321,
            "bigbench_elementary_math_qa": 6.311301112174988,
            "bigbench_dyck_languages": 5.741776057652065,
            "agi_eval_lsat_ar": 4.894238660646521,
            "bigbench_cs_algorithms": 9.226118628184,
            "bigbench_logical_deduction": 3.8478539367516835,
            "bigbench_operators": 6.360323676608858,
            "bigbench_repeat_copy_logic": 5.99883633852005,
            "simple_arithmetic_nospaces": 8.198949417647194,
            "simple_arithmetic_withspaces": 7.6457058276448935,
            "math_qa": 4.355788095792135,
            "logi_qa": 5.183397427132602,
            "pubmed_qa_labeled": 10.572350583156618,
            "squad": 6.430107239882151,
            "agi_eval_lsat_rc": 5.238696099217258,
            "agi_eval_lsat_lr": 5.0436007078956155,
            "coqa": 6.7285862763722735,
            "bigbench_understanding_fables": 4.865934952226265,
            "boolq": 8.031130421161652,
            "agi_eval_sat_en": 5.261384382988643,
            "winogender_mc_female": 6.85791240533193,
            "winogender_mc_male": 6.650048081080119,
            "enterprise_pii_classification": 6.443713486194611,
            "bbq": 5.406757255395253,
            "human_eval_return_complex": 4.116443831150926,
            "human_eval_return_simple": 5.767660398741026,
            "human_eval-0.5": 4.0416764544277655,
            "human_eval-0.25": 4.050919872958485,
            "human_eval-0.75": 4.0982970333680875,
            "human_eval": 4.1135259634110986,
            "processed_human_eval_cpp": 3.7496105928598724,
            "processed_human_eval_js": 3.0722571989385092
        },
        "icl": {
            "mmlu_zeroshot": 0.23159354820586087,
            "mmlu": 0.23172179491896377,
            "hellaswag_zeroshot": 0.24088826775550842,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.004081632476300001,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.0,
            "arc_easy": 0.2752525210380554,
            "arc_challenge": 0.2081911265850067,
            "bigbench_misconceptions": 0.5251141786575317,
            "copa": 0.4399999976158142,
            "siqa": 0.48413512110710144,
            "commonsense_qa": 0.1957411915063858,
            "piqa": 0.5353645086288452,
            "openbook_qa": 0.21799999475479126,
            "bigbench_novel_concepts": 0.21875,
            "bigbench_strange_stories": 0.5057471394538879,
            "bigbench_strategy_qa": 0.4814329445362091,
            "lambada_openai": 0.0017465554410591722,
            "hellaswag": 0.23879705369472504,
            "winograd": 0.49450549483299255,
            "winogrande": 0.5153906941413879,
            "bigbench_conlang_translation": 0.0,
            "bigbench_language_identification": 0.25,
            "bigbench_conceptual_combinations": 0.291262149810791,
            "bigbench_elementary_math_qa": 0.2719077467918396,
            "bigbench_dyck_languages": 0.0,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.012121211737394333,
            "bigbench_logical_deduction": 0.2626666724681854,
            "bigbench_operators": 0.06666667014360428,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.004999999888241291,
            "math_qa": 0.2467314749956131,
            "logi_qa": 0.20276497304439545,
            "pubmed_qa_labeled": 0.0,
            "squad": 0.0,
            "agi_eval_lsat_rc": 0.2761194109916687,
            "agi_eval_lsat_lr": 0.27843138575553894,
            "coqa": 0.0001252661895705387,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.3785932660102844,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.5166666507720947,
            "winogender_mc_male": 0.4333333373069763,
            "enterprise_pii_classification": 0.5063328146934509,
            "bbq": 0.4342402355237441
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.037085125843683876,
        "language understanding": 0.006183611883898266,
        "reading comprehension": -0.011205869779587372,
        "safety": -0.054713480851866975,
        "symbolic problem solving": 0.01995087229232821,
        "world knowledge": -0.002553290211415528
    },
    "aggregated_centered_results": 0.006205603585688024,
    "aggregated_results": 0.22103162577373694,
    "rw_small": 0.34305264552434284,
    "95%_CI_above": 0.20607512989756124,
    "99%_CI_above": 0.18215214376046787,
    "model_uuid": "cf386712-f2a5-4b28-8d13-97ce6d3357c3",
    "low_variance_datasets": 0.1953395808781021
}