{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "11b2a889-f6ac-4141-b2aa-cc19f921cc7f",
    "model": "open_lm_1b",
    "creation_date": "2024_02_15-13_50_33",
    "eval_metrics": {
        "perplexity": 2.2233589907487232,
        "downstream_perpexity": {
            "mmlu": 2.0651818245649336,
            "hellaswag": 2.5714266136288644,
            "jeopardy_all": 2.22719559147954,
            "triviaqa_sm_sub": 2.613338910788298,
            "gsm8k": 2.1096173360943795,
            "agi_eval_sat_math": 1.6425683720545334,
            "aqua": 2.513452050150657,
            "svamp": 2.3842894283930463,
            "bigbench_qa_wikidata": 4.156342931588491,
            "arc_easy": 2.894426533828179,
            "arc_challenge": 2.8694323827823003,
            "bigbench_misconceptions": 6.094671832916399,
            "copa": 2.728789412975311,
            "siqa": 1.525943383326133,
            "commonsense_qa": 1.8590563376744589,
            "piqa": 2.8196494872371356,
            "openbook_qa": 4.470730357646942,
            "bigbench_novel_concepts": 2.885425329208374,
            "bigbench_strange_stories": 3.4545517563819885,
            "bigbench_strategy_qa": 2.369422461092472,
            "lambada_openai": 1.5835935056597614,
            "winograd_wsc": 2.6632826125228797,
            "winogrande": 3.236966653664907,
            "bigbench_conlang_translation": 2.0871099611607993,
            "bigbench_language_identification": 3.220499974489212,
            "bigbench_conceptual_combinations": 0.9322400671764485,
            "bigbench_elementary_math_qa": 2.888030516107877,
            "bigbench_dyck_languages": 4.973727111021677,
            "agi_eval_lsat_ar": 1.9939680949501368,
            "bigbench_cs_algorithms": 5.897433803478877,
            "bigbench_logical_deduction": 1.001220705608527,
            "bigbench_operators": 5.498744396936326,
            "bigbench_repeat_copy_logic": 1.6487938687205315,
            "simple_arithmetic_nospaces": 7.254860277970632,
            "simple_arithmetic_withspaces": 5.964193544785181,
            "math_qa": 3.1711423685153326,
            "logi_qa": 2.0780169392511043,
            "pubmed_qa_labeled": 3.5559075435002643,
            "squad": 2.321383733302355,
            "agi_eval_lsat_rc": 1.9054813033609248,
            "agi_eval_lsat_lr": 1.8875836073183545,
            "coqa": 1.8595943316817283,
            "bigbench_understanding_fables": 2.1048489954105762,
            "boolq": 2.31228845367829,
            "agi_eval_sat_en": 1.884699127049122,
            "winogender_mc_female": 1.3076320866743723,
            "winogender_mc_male": 1.4203659454981485,
            "enterprise_pii_classification": 3.701283309857051,
            "bbq": 0.3209469104806582,
            "human_eval_return_complex": 1.0107985716166459,
            "human_eval_return_simple": 3.4113156859939164,
            "human_eval-0.5": 0.8961125808518108,
            "human_eval-0.25": 0.9378209993606661,
            "human_eval-0.75": 0.8994561275694428,
            "human_eval": 0.9940056451937047,
            "processed_human_eval_cpp": 0.8960354742796525,
            "processed_human_eval_js": 0.861517595263516
        },
        "icl": {
            "mmlu_zeroshot": 0.25444119488983824,
            "mmlu": 0.25145919908557024,
            "hellaswag_zeroshot": 0.45060744881629944,
            "jeopardy": 0.12897960729897023,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0007581501267850399,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.6013975739479065,
            "arc_easy": 0.5484007000923157,
            "arc_challenge": 0.27901023626327515,
            "bigbench_misconceptions": 0.49771690368652344,
            "copa": 0.6499999761581421,
            "siqa": 0.47799384593963623,
            "commonsense_qa": 0.21621622145175934,
            "piqa": 0.6882480978965759,
            "openbook_qa": 0.33799999952316284,
            "bigbench_novel_concepts": 0.34375,
            "bigbench_strange_stories": 0.540229856967926,
            "bigbench_strategy_qa": 0.5268676280975342,
            "lambada_openai": 0.5229963064193726,
            "hellaswag": 0.45389366149902344,
            "winograd": 0.7032967209815979,
            "winogrande": 0.5311760306358337,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.2529999911785126,
            "bigbench_conceptual_combinations": 0.223300963640213,
            "bigbench_elementary_math_qa": 0.24848008155822754,
            "bigbench_dyck_languages": 0.2070000022649765,
            "agi_eval_lsat_ar": 0.22173912823200226,
            "bigbench_cs_algorithms": 0.44090908765792847,
            "bigbench_logical_deduction": 0.2513333261013031,
            "bigbench_operators": 0.190476194024086,
            "bigbench_repeat_copy_logic": 0.0625,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.004000000189989805,
            "math_qa": 0.2507542669773102,
            "logi_qa": 0.22580644488334656,
            "pubmed_qa_labeled": 0.5519999861717224,
            "squad": 0.33604541420936584,
            "agi_eval_lsat_rc": 0.25746268033981323,
            "agi_eval_lsat_lr": 0.26274511218070984,
            "coqa": 0.25078290700912476,
            "bigbench_understanding_fables": 0.24338623881340027,
            "boolq": 0.5394495129585266,
            "agi_eval_sat_en": 0.276699036359787,
            "winogender_mc_female": 0.44999998807907104,
            "winogender_mc_male": 0.5166666507720947,
            "enterprise_pii_classification": 0.49455082416534424,
            "bbq": 0.49639683961868286
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.16424589107433957,
        "language understanding": 0.1889841272495687,
        "reading comprehension": 0.15893142794569334,
        "safety": -0.021192848682403564,
        "symbolic problem solving": 0.07356178953001896,
        "world knowledge": 0.14631992863221074
    },
    "aggregated_centered_results": 0.12326483230265037,
    "aggregated_results": 0.3192352538513307,
    "rw_small": 0.49977398415406543,
    "95%_CI_above": 0.37415185546812907,
    "99%_CI_above": 0.37633735151216385,
    "model_uuid": "affe7ca4-aee5-4b58-bdeb-01234f3722ad",
    "low_variance_datasets": 0.39155112811448894
}