{
    "name": "/<scrub>/task_runtime/scrub/eval/heavy",
    "uuid": "56f28238-e575-43bd-afd0-c5989b699f9a",
    "model": "d=1024_l=24_h=8",
    "creation_date": "2024_01_29-16_02_29",
    "eval_metrics": {
        "perplexity": 3.6106398661931354,
        "downstream_perpexity": {
            "mmlu": 4.540236081679662,
            "hellaswag": 2.677099186182022,
            "jeopardy_all": 3.5005744129419325,
            "triviaqa_sm_sub": 3.6496493568023047,
            "gsm8k": 2.7947603245576222,
            "agi_eval_sat_math": 3.079992071065036,
            "aqua": 3.3340988246762024,
            "svamp": 3.5853687659899394,
            "bigbench_qa_wikidata": 4.817022035519282,
            "arc_easy": 3.4397936155398687,
            "arc_challenge": 3.39310293396314,
            "bigbench_misconceptions": 5.798261779628388,
            "copa": 2.934760479927063,
            "siqa": 4.730074417591095,
            "commonsense_qa": 4.883483783404032,
            "piqa": 2.8933808147907256,
            "openbook_qa": 4.723772603034973,
            "bigbench_novel_concepts": 3.511775463819504,
            "bigbench_strange_stories": 3.7253632764706666,
            "bigbench_strategy_qa": 2.210476859410604,
            "lambada_openai": 2.3435602116088075,
            "winograd_wsc": 2.8916019192545406,
            "winogrande": 3.351736368735631,
            "bigbench_conlang_translation": 2.491443848464547,
            "bigbench_language_identification": 4.590561648209889,
            "bigbench_conceptual_combinations": 0.6902308128412488,
            "bigbench_elementary_math_qa": 3.6279515107472737,
            "bigbench_dyck_languages": 6.387578775902756,
            "agi_eval_lsat_ar": 4.342318458142488,
            "bigbench_cs_algorithms": 5.202423560619354,
            "bigbench_logical_deduction": 0.8821069493889808,
            "bigbench_operators": 6.029962028775897,
            "bigbench_repeat_copy_logic": 2.1998035684227943,
            "simple_arithmetic_nospaces": 7.524716262556925,
            "simple_arithmetic_withspaces": 6.899095140585379,
            "math_qa": 4.429006818930308,
            "logi_qa": 3.761914183284097,
            "pubmed_qa_labeled": 7.60196071412383,
            "squad": 2.473686625560125,
            "agi_eval_lsat_rc": 4.386749585173023,
            "agi_eval_lsat_lr": 4.392514789805693,
            "coqa": 2.729225657383601,
            "bigbench_understanding_fables": 4.636549899187037,
            "boolq": 4.224774058659872,
            "agi_eval_sat_en": 4.592628799595879,
            "winogender_mc_female": 2.540430905421575,
            "winogender_mc_male": 2.454490574200948,
            "enterprise_pii_classification": 5.412275882562001,
            "bbq": 0.5550091471523046,
            "human_eval_return_complex": 4.171705172756526,
            "human_eval_return_simple": 6.445865064053922,
            "human_eval-0.5": 4.005032729811784,
            "human_eval-0.25": 4.099463997817621,
            "human_eval-0.75": 4.222971711216903,
            "human_eval": 4.221120033322311,
            "processed_human_eval_cpp": 4.537510303236683,
            "processed_human_eval_js": 4.213985021521405
        },
        "icl": {
            "mmlu_zeroshot": 0.23440818279458767,
            "mmlu": 0.24564419085519357,
            "hellaswag_zeroshot": 0.38428598642349243,
            "jeopardy": 0.0,
            "triviaqa_sm_sub": 0.0006666666595265269,
            "gsm8k": 0.0007581501267850399,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.437773734331131,
            "arc_easy": 0.4234006702899933,
            "arc_challenge": 0.23122866451740265,
            "bigbench_misconceptions": 0.456620991230011,
            "copa": 0.6899999976158142,
            "siqa": 0.48669394850730896,
            "commonsense_qa": 0.2153972089290619,
            "piqa": 0.6844395995140076,
            "openbook_qa": 0.3019999861717224,
            "bigbench_novel_concepts": 0.1875,
            "bigbench_strange_stories": 0.5114942789077759,
            "bigbench_strategy_qa": 0.4914810061454773,
            "lambada_openai": 0.34271296858787537,
            "hellaswag": 0.3829914331436157,
            "winograd": 0.6007326245307922,
            "winogrande": 0.5319652557373047,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.25130000710487366,
            "bigbench_conceptual_combinations": 0.3009708821773529,
            "bigbench_elementary_math_qa": 0.25151991844177246,
            "bigbench_dyck_languages": 0.12800000607967377,
            "agi_eval_lsat_ar": 0.22173912823200226,
            "bigbench_cs_algorithms": 0.4280303120613098,
            "bigbench_logical_deduction": 0.2526666522026062,
            "bigbench_operators": 0.1428571492433548,
            "bigbench_repeat_copy_logic": 0.03125,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.004999999888241291,
            "math_qa": 0.25879988074302673,
            "logi_qa": 0.24884793162345886,
            "pubmed_qa_labeled": 0.4269999861717224,
            "squad": 0.011542100459337234,
            "agi_eval_lsat_rc": 0.26865673065185547,
            "agi_eval_lsat_lr": 0.2803921699523926,
            "coqa": 0.11574596166610718,
            "bigbench_understanding_fables": 0.29100528359413147,
            "boolq": 0.5850152969360352,
            "agi_eval_sat_en": 0.23786407709121704,
            "winogender_mc_female": 0.4166666567325592,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.5030927658081055,
            "bbq": 0.4832503714344718
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.12226007382074992,
        "language understanding": 0.13079603280251223,
        "reading comprehension": 0.10352620715275407,
        "safety": -0.05682844329964032,
        "symbolic problem solving": 0.06519729182538059,
        "world knowledge": 0.06640724934086446
    },
    "aggregated_centered_results": 0.08106984527680769,
    "aggregated_results": 0.2839898570321276,
    "rw_small": 0.47431597858667374,
    "95%_CI_above": 0.3178003059319136,
    "99%_CI_above": 0.31287835946306586,
    "model_uuid": "036aa198-4da2-4e15-8a8f-1fe2b8b00c6a",
    "low_variance_datasets": 0.3246549132534049
}