{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "63b75053-a2d9-4b4d-accd-1ab16082457f",
    "model": "open_lm_1b",
    "creation_date": "2024_03_31-22_49_14",
    "eval_metrics": {
        "perplexity": 2.6472478826840717,
        "downstream_perpexity": {
            "mmlu": 1.697989415738712,
            "hellaswag": 2.318749326677632,
            "jeopardy_all": 1.6692385997871324,
            "triviaqa_sm_sub": 2.370636707365513,
            "gsm8k": 1.7594999545808088,
            "agi_eval_sat_math": 1.5438968414610081,
            "aqua": 2.243176388253971,
            "svamp": 2.7100898377100626,
            "bigbench_qa_wikidata": 3.8748995396277666,
            "arc_easy": 2.331335552362882,
            "arc_challenge": 2.4650924123899927,
            "bigbench_misconceptions": 4.838275452182717,
            "copa": 2.4754351890087127,
            "siqa": 1.5095985807562313,
            "commonsense_qa": 1.9165390176327868,
            "piqa": 2.6947213757673727,
            "openbook_qa": 4.050053576469422,
            "bigbench_novel_concepts": 2.3765948712825775,
            "bigbench_strange_stories": 3.081732012074569,
            "bigbench_strategy_qa": 1.8256123661213746,
            "lambada_openai": 1.3912467899246659,
            "winograd_wsc": 2.4858498940101037,
            "winogrande": 3.0509346720733883,
            "bigbench_conlang_translation": 1.9436439362967886,
            "bigbench_language_identification": 4.137074087113946,
            "bigbench_conceptual_combinations": 0.9190020196646163,
            "bigbench_elementary_math_qa": 3.4943057939519546,
            "bigbench_dyck_languages": 4.018090147256851,
            "agi_eval_lsat_ar": 1.7621901906054953,
            "bigbench_cs_algorithms": 5.5409375389417015,
            "bigbench_logical_deduction": 1.0829541920423507,
            "bigbench_operators": 4.759257886523292,
            "bigbench_repeat_copy_logic": 1.3572513274848461,
            "simple_arithmetic_nospaces": 6.539688305854797,
            "simple_arithmetic_withspaces": 6.041264312744141,
            "math_qa": 4.268579270509598,
            "logi_qa": 1.8428037637206633,
            "pubmed_qa_labeled": 4.3489901471138,
            "squad": 2.2317726852201356,
            "agi_eval_lsat_rc": 1.694901292893424,
            "agi_eval_lsat_lr": 1.732560002336315,
            "coqa": 2.6420854824042985,
            "bigbench_understanding_fables": 3.7032776005053645,
            "boolq": 2.8728272047976104,
            "agi_eval_sat_en": 1.697437381281436,
            "winogender_mc_female": 1.0211677819490432,
            "winogender_mc_male": 1.015799480676651,
            "enterprise_pii_classification": 5.492012695570933,
            "bbq": 0.33525174835085786,
            "human_eval_return_complex": 2.8876658950264997,
            "human_eval_return_simple": 5.901818494539003,
            "human_eval-0.5": 2.8601159242595116,
            "human_eval-0.25": 3.044461341892801,
            "human_eval-0.75": 2.9173327030205147,
            "human_eval": 3.3251365524966543,
            "processed_human_eval_cpp": 3.2461167966356927,
            "processed_human_eval_js": 2.6635974885486977
        },
        "icl": {
            "mmlu_zeroshot": 0.25353739528279556,
            "hellaswag_zeroshot": 0.5952997207641602,
            "jeopardy": 0.25903665125370023,
            "triviaqa_sm_sub": 0.21133333444595337,
            "gsm8k": 0.004548900760710239,
            "agi_eval_sat_math": 0.022727273404598236,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.5919492244720459,
            "arc_easy": 0.6700336933135986,
            "arc_challenge": 0.37030717730522156,
            "bigbench_misconceptions": 0.4885844886302948,
            "copa": 0.7599999904632568,
            "siqa": 0.4923234283924103,
            "commonsense_qa": 0.21949222683906555,
            "piqa": 0.7431991100311279,
            "openbook_qa": 0.38199999928474426,
            "bigbench_novel_concepts": 0.46875,
            "bigbench_strange_stories": 0.545976996421814,
            "bigbench_strategy_qa": 0.5168195962905884,
            "lambada_openai": 0.5670483112335205,
            "hellaswag": 0.5993825793266296,
            "winograd": 0.7765567898750305,
            "winogrande": 0.5848460793495178,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.25780001282691956,
            "bigbench_conceptual_combinations": 0.24271844327449799,
            "bigbench_elementary_math_qa": 0.24261006712913513,
            "bigbench_dyck_languages": 0.2070000022649765,
            "agi_eval_lsat_ar": 0.2652173936367035,
            "bigbench_cs_algorithms": 0.36212122440338135,
            "bigbench_logical_deduction": 0.23733332753181458,
            "bigbench_operators": 0.18571428954601288,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004999999888241291,
            "simple_arithmetic_withspaces": 0.007000000216066837,
            "math_qa": 0.25879988074302673,
            "logi_qa": 0.2626728117465973,
            "pubmed_qa_labeled": 0.24500000476837158,
            "squad": 0.382497638463974,
            "agi_eval_lsat_rc": 0.26865673065185547,
            "agi_eval_lsat_lr": 0.24705882370471954,
            "coqa": 0.30276837944984436,
            "bigbench_understanding_fables": 0.21693122386932373,
            "boolq": 0.5941895842552185,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4333333373069763,
            "winogender_mc_male": 0.5166666507720947,
            "enterprise_pii_classification": 0.48807069659233093,
            "bbq": 0.48145785385912115,
            "gpqa_main": 0.2321428507566452,
            "gpqa_diamond": 0.21212121844291687,
            "mmlu_fewshot": 0.2579803338699173,
            "gsm8k_cot": 0.007581501267850399,
            "agi_eval_sat_math_cot": 0.022727273404598236,
            "aqua_cot": 0.008163264952600002,
            "svamp_cot": 0.05666666850447655
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.23839498484281005,
        "language understanding": 0.3002602324515737,
        "reading comprehension": 0.13701419062553005,
        "safety": -0.040235730734738434,
        "symbolic problem solving": 0.07379252110442071,
        "world knowledge": 0.17009850793937492
    },
    "aggregated_centered_results": 0.15192991012038523,
    "aggregated_results": 0.3380687600896218,
    "rw_small": 0.5957793742418289,
    "rw_small_centered": 0.3004373524621216,
    "95%_CI_above": 0.4330053382879728,
    "95%_CI_above_centered": 0.24935321111374112,
    "99%_CI_above": 0.44137959856054054,
    "99%_CI_above_centered": 0.28996276075516286,
    "low_variance_datasets": 0.43983909447084774,
    "low_variance_datasets_centered": 0.30143183924371447,
    "model_uuid": "15bdc463-5dcf-4b39-8652-7c79709623df",
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000avg=poly_64_heavy.json",
    "missing tasks": "[]",
    "Core": 0.30143183924371447,
    "Extended": 0.15192991012038523
}