{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "2e38d618-d1e5-41dc-abde-799f0e665d91",
    "model": "open_lm_1b",
    "creation_date": "2024_02_11-20_58_16",
    "eval_metrics": {
        "perplexity": 2.5676534354686735,
        "downstream_perpexity": {
            "mmlu": 1.6891005248905264,
            "hellaswag": 2.35488909683577,
            "jeopardy_all": 1.545278757578312,
            "triviaqa_sm_sub": 2.2139556770424047,
            "gsm8k": 1.7544151728039352,
            "agi_eval_sat_math": 1.5361095119606365,
            "aqua": 2.2406328254816485,
            "svamp": 2.546994519233704,
            "bigbench_qa_wikidata": 3.35287206301034,
            "arc_easy": 2.3098046725446526,
            "arc_challenge": 2.4716928901831037,
            "bigbench_misconceptions": 3.9323105082664314,
            "copa": 2.496270549297333,
            "siqa": 1.4326494297083228,
            "commonsense_qa": 1.857463154706869,
            "piqa": 2.671247202372006,
            "openbook_qa": 4.140183201313019,
            "bigbench_novel_concepts": 2.4005865454673767,
            "bigbench_strange_stories": 3.15849864003302,
            "bigbench_strategy_qa": 1.87249360937591,
            "lambada_openai": 1.3650540030193956,
            "winograd_wsc": 2.484088184414329,
            "winogrande": 3.102969632152506,
            "bigbench_conlang_translation": 1.9438536937643842,
            "bigbench_language_identification": 3.376919722075366,
            "bigbench_conceptual_combinations": 1.0944252552338016,
            "bigbench_elementary_math_qa": 3.6399410032688695,
            "bigbench_dyck_languages": 4.676666002511978,
            "agi_eval_lsat_ar": 1.7815442489541096,
            "bigbench_cs_algorithms": 5.0734025962425005,
            "bigbench_logical_deduction": 1.0494073539177577,
            "bigbench_operators": 4.808074713888622,
            "bigbench_repeat_copy_logic": 1.4242343418300152,
            "simple_arithmetic_nospaces": 6.699007217407226,
            "simple_arithmetic_withspaces": 6.476122812271118,
            "math_qa": 3.450080469955025,
            "logi_qa": 1.8768677986162599,
            "pubmed_qa_labeled": 3.39150008392334,
            "squad": 2.191181993789005,
            "agi_eval_lsat_rc": 1.652946514424993,
            "agi_eval_lsat_lr": 1.7662893489295362,
            "coqa": 2.663274790124309,
            "bigbench_understanding_fables": 2.3337230341775075,
            "boolq": 2.834319068240828,
            "agi_eval_sat_en": 1.670616541093993,
            "winogender_mc_female": 1.1018253276745478,
            "winogender_mc_male": 0.9737814451257388,
            "enterprise_pii_classification": 5.567583414170458,
            "bbq": 0.29215878841120035,
            "human_eval_return_complex": 2.584959360558217,
            "human_eval_return_simple": 5.230131587466678,
            "human_eval-0.5": 2.558508766133611,
            "human_eval-0.25": 2.703192290736408,
            "human_eval-0.75": 2.611893744003482,
            "human_eval": 2.8760344604166543,
            "processed_human_eval_cpp": 2.8024757900593444,
            "processed_human_eval_js": 2.6757032900321773
        },
        "icl": {
            "mmlu_zeroshot": 0.2645844472082038,
            "hellaswag_zeroshot": 0.5709022283554077,
            "jeopardy": 0.3144381046295166,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6208847761154175,
            "arc_easy": 0.6704545617103577,
            "arc_challenge": 0.3575085401535034,
            "bigbench_misconceptions": 0.4383561611175537,
            "copa": 0.6899999976158142,
            "siqa": 0.48515865206718445,
            "commonsense_qa": 0.2694512605667114,
            "piqa": 0.7431991100311279,
            "openbook_qa": 0.3779999911785126,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.5344827771186829,
            "bigbench_strategy_qa": 0.5072084069252014,
            "lambada_openai": 0.5862604379653931,
            "hellaswag": 0.5771758556365967,
            "winograd": 0.7802197933197021,
            "winogrande": 0.5832675695419312,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.25529998540878296,
            "bigbench_conceptual_combinations": 0.3106796145439148,
            "bigbench_elementary_math_qa": 0.24764150381088257,
            "bigbench_dyck_languages": 0.20200000703334808,
            "agi_eval_lsat_ar": 0.2130434811115265,
            "bigbench_cs_algorithms": 0.4530302882194519,
            "bigbench_logical_deduction": 0.24266666173934937,
            "bigbench_operators": 0.20000000298023224,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004000000189989805,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.2534361481666565,
            "logi_qa": 0.26728111505508423,
            "pubmed_qa_labeled": 0.5360000133514404,
            "squad": 0.3860927224159241,
            "agi_eval_lsat_rc": 0.25,
            "agi_eval_lsat_lr": 0.2803921699523926,
            "coqa": 0.3082800805568695,
            "bigbench_understanding_fables": 0.2380952388048172,
            "boolq": 0.6201834678649902,
            "agi_eval_sat_en": 0.223300963640213,
            "winogender_mc_female": 0.4166666567325592,
            "winogender_mc_male": 0.5666666626930237,
            "enterprise_pii_classification": 0.4506627321243286,
            "bbq": 0.5034658096053384,
            "mmlu_fewshot": 0.2533582671169649,
            "gsm8k_cot": 0.008339650928974152,
            "agi_eval_sat_math_cot": 0.013636363670229912,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.046666666865348816,
            "gpqa_main": 0.2098214328289032,
            "gpqa_diamond": 0.19696970283985138
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.22629815376717885,
        "language understanding": 0.3079229662690734,
        "reading comprehension": 0.1757332715369238,
        "safety": -0.03126906942237506,
        "symbolic problem solving": 0.07513432373339522,
        "world knowledge": 0.14159644721201842
    },
    "aggregated_centered_results": 0.15278373930220793,
    "aggregated_results": 0.34077425076102225,
    "rw_small": 0.5820527275403341,
    "95%_CI_above": 0.4499889274502729,
    "99%_CI_above": 0.4570023922816567,
    "model_uuid": "03d35414-235b-4cf2-b304-aa734c4d516a",
    "low_variance_datasets": 0.4445314664732326,
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_openhermes_vs_rw_v2_unigram_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124_heavy.json",
    "missing tasks": "[]",
    "rw_small_centered": 0.279806397463146,
    "95%_CI_above_centered": 0.2649120618229698,
    "99%_CI_above_centered": 0.30191337587745204,
    "low_variance_datasets_centered": 0.3039741011971289,
    "Core": 0.3039741011971289,
    "Extended": 0.15278373930220793
}