{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "cdb45c1a-aa09-4f65-9796-a8009707977f",
    "model": "open_lm_7b",
    "creation_date": "2024_04_07-15_32_14",
    "eval_metrics": {
        "perplexity": 2.119118694464366,
        "downstream_perpexity": {
            "mmlu": 1.6789379902942756,
            "hellaswag": 2.1318036386202683,
            "jeopardy_all": 1.006128338314716,
            "triviaqa_sm_sub": 1.56289792006587,
            "gsm8k": 1.5475449154465555,
            "agi_eval_sat_math": 1.3628604390404442,
            "aqua": 2.0226530722209386,
            "svamp": 2.3984528676668804,
            "bigbench_qa_wikidata": 3.065953903891992,
            "arc_easy": 1.9889754758840459,
            "arc_challenge": 2.1554797170206954,
            "bigbench_misconceptions": 2.3572249662930562,
            "copa": 2.2696435022354127,
            "siqa": 1.5305913653198016,
            "commonsense_qa": 1.78743018936076,
            "piqa": 2.313618339508999,
            "openbook_qa": 3.8353186230659486,
            "bigbench_novel_concepts": 2.2578517124056816,
            "bigbench_strange_stories": 2.552180138812668,
            "bigbench_strategy_qa": 1.6660589347400223,
            "lambada_openai": 0.962191708668251,
            "winograd_wsc": 2.31883970896403,
            "winogrande": 2.9189954483330296,
            "bigbench_conlang_translation": 1.5789105645040187,
            "bigbench_language_identification": 1.522443570180234,
            "bigbench_conceptual_combinations": 0.9642182901067641,
            "bigbench_elementary_math_qa": 4.062073056221758,
            "bigbench_dyck_languages": 3.1177836768627167,
            "agi_eval_lsat_ar": 1.7481618653173032,
            "bigbench_cs_algorithms": 2.6316004897608902,
            "bigbench_logical_deduction": 1.2548349142471948,
            "bigbench_operators": 4.568725154513404,
            "bigbench_repeat_copy_logic": 1.0694170445203781,
            "simple_arithmetic_nospaces": 6.13849173283577,
            "simple_arithmetic_withspaces": 5.172511415958405,
            "math_qa": 1.5805834457852719,
            "logi_qa": 1.9461561203369164,
            "pubmed_qa_labeled": 4.469590000629425,
            "squad": 1.5160685896577424,
            "agi_eval_lsat_rc": 1.6061788534940178,
            "agi_eval_lsat_lr": 1.7383933193543377,
            "coqa": 1.1575645692669836,
            "bigbench_understanding_fables": 1.5191783980717735,
            "boolq": 3.28903179073917,
            "agi_eval_sat_en": 1.6002823183837447,
            "winogender_mc_female": 0.8637628485759099,
            "winogender_mc_male": 0.7027225295702616,
            "enterprise_pii_classification": 3.5221201225303234,
            "bbq": 0.2949154123815864,
            "human_eval_return_complex": 1.0642278415011608,
            "human_eval_return_simple": 2.390305258132316,
            "human_eval-0.5": 0.9894874982717561,
            "human_eval-0.25": 1.013277646971912,
            "human_eval-0.75": 1.0306097485306787,
            "human_eval": 1.0757178776874774,
            "processed_human_eval_cpp": 1.1761179266509063,
            "processed_human_eval_js": 1.1386373530073863
        },
        "icl": {
            "mmlu_zeroshot": 0.2630054161213992,
            "hellaswag_zeroshot": 0.7239593863487244,
            "jeopardy": 0.4411972463130951,
            "triviaqa_sm_sub": 0.4230000078678131,
            "gsm8k": 0.011372251436114311,
            "agi_eval_sat_math": 0.013636363670229912,
            "aqua": 0.004081632476300001,
            "bigbench_qa_wikidata": 0.6922887563705444,
            "arc_easy": 0.7146464586257935,
            "arc_challenge": 0.43856656551361084,
            "bigbench_misconceptions": 0.5251141786575317,
            "copa": 0.8399999737739563,
            "siqa": 0.5076765418052673,
            "commonsense_qa": 0.22522522509098053,
            "piqa": 0.7916213274002075,
            "openbook_qa": 0.4180000126361847,
            "bigbench_novel_concepts": 0.5625,
            "bigbench_strange_stories": 0.6436781883239746,
            "bigbench_strategy_qa": 0.5731760859489441,
            "lambada_openai": 0.6937706470489502,
            "hellaswag": 0.7339175343513489,
            "winograd": 0.8241758346557617,
            "winogrande": 0.6685082912445068,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.2502000033855438,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.24287211894989014,
            "bigbench_dyck_languages": 0.17900000512599945,
            "agi_eval_lsat_ar": 0.27391305565834045,
            "bigbench_cs_algorithms": 0.4598484933376312,
            "bigbench_logical_deduction": 0.26600000262260437,
            "bigbench_operators": 0.24761904776096344,
            "bigbench_repeat_copy_logic": 0.1875,
            "simple_arithmetic_nospaces": 0.01600000075995922,
            "simple_arithmetic_withspaces": 0.017999999225139618,
            "math_qa": 0.24807241559028625,
            "logi_qa": 0.28264209628105164,
            "pubmed_qa_labeled": 0.2409999966621399,
            "squad": 0.4775780439376831,
            "agi_eval_lsat_rc": 0.24626865983009338,
            "agi_eval_lsat_lr": 0.24901960790157318,
            "coqa": 0.40285608172416687,
            "bigbench_understanding_fables": 0.26455026865005493,
            "boolq": 0.691437304019928,
            "agi_eval_sat_en": 0.25242719054222107,
            "winogender_mc_female": 0.5,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.5036818981170654,
            "bbq": 0.4330042979933999,
            "gpqa_main": 0.2299107164144516,
            "gpqa_diamond": 0.22727273404598236,
            "mmlu_fewshot": 0.279204527537028,
            "gsm8k_cot": 0.015163002535700798,
            "agi_eval_sat_math_cot": 0.0181818176060915,
            "aqua_cot": 0.008163264952600002,
            "svamp_cot": 0.06666667014360428
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.3334364462348315,
        "language understanding": 0.3982139936181111,
        "reading comprehension": 0.18905768152162955,
        "safety": -0.03999024223197589,
        "symbolic problem solving": 0.1014688155056747,
        "world knowledge": 0.2476856258877537
    },
    "aggregated_centered_results": 0.21140740553832701,
    "aggregated_results": 0.3822483552447961,
    "rw_small": 0.6795196632544199,
    "rw_small_centered": 0.4513176637783385,
    "95%_CI_above": 0.5001138233422459,
    "95%_CI_above_centered": 0.33884084141912724,
    "99%_CI_above": 0.5161004863355471,
    "99%_CI_above_centered": 0.38968124644423063,
    "low_variance_datasets": 0.5170831497419964,
    "low_variance_datasets_centered": 0.404065563308949,
    "model_uuid": "3d61485f-5de3-4245-b462-b4c6a9c462a2",
    "_filename": "exp_data/evals/evaluation_rw_v2_cc_v3_f0.15_resiliparse_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.2-open_lm_7b-warm=5000-lr=0p0003-wd=0p33-cd=3e-05-bs=2048-mult=1-seed=124-tokens=137788211200_heavy.json",
    "missing tasks": "[]",
    "Core": 0.404065563308949,
    "Extended": 0.21140740553832701
}