{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "e21a8faa-0add-4e26-89b7-16ce92bad482",
    "model": "open_lm_1b",
    "creation_date": "2024_02_18-16_25_20",
    "eval_metrics": {
        "perplexity": 2.5491474747657774,
        "downstream_perpexity": {
            "mmlu": 1.59958512631768,
            "hellaswag": 2.34951800064033,
            "jeopardy_all": 1.4749219643940157,
            "triviaqa_sm_sub": 2.2420961368878682,
            "gsm8k": 1.8987682271491768,
            "agi_eval_sat_math": 1.5895160545002331,
            "aqua": 2.3129448360326337,
            "svamp": 2.7555756775538125,
            "bigbench_qa_wikidata": 3.815960733447392,
            "arc_easy": 2.276514744517779,
            "arc_challenge": 2.4700259831527394,
            "bigbench_misconceptions": 5.338695493462968,
            "copa": 2.469472128152847,
            "siqa": 1.4228836365707578,
            "commonsense_qa": 1.6285463841990504,
            "piqa": 2.686050715721471,
            "openbook_qa": 4.105322894096375,
            "bigbench_novel_concepts": 2.449741095304489,
            "bigbench_strange_stories": 3.1903581098578444,
            "bigbench_strategy_qa": 1.935750678982366,
            "lambada_openai": 1.3059671211596626,
            "winograd_wsc": 2.447283371464237,
            "winogrande": 3.0755434045663668,
            "bigbench_conlang_translation": 1.940628798269644,
            "bigbench_language_identification": 4.580052000328311,
            "bigbench_conceptual_combinations": 0.8562615282327226,
            "bigbench_elementary_math_qa": 2.9462084912966384,
            "bigbench_dyck_languages": 5.291287735939026,
            "agi_eval_lsat_ar": 1.7117346727329752,
            "bigbench_cs_algorithms": 4.518134411537286,
            "bigbench_logical_deduction": 1.01314972615242,
            "bigbench_operators": 5.0086916628338045,
            "bigbench_repeat_copy_logic": 1.4466578550636768,
            "simple_arithmetic_nospaces": 6.617557120800019,
            "simple_arithmetic_withspaces": 6.390590137958527,
            "math_qa": 3.54143806203995,
            "logi_qa": 1.6635625287318192,
            "pubmed_qa_labeled": 5.39091869354248,
            "squad": 2.1165515120117417,
            "agi_eval_lsat_rc": 1.7251223588167732,
            "agi_eval_lsat_lr": 1.716243916165595,
            "coqa": 2.9885788629426253,
            "bigbench_understanding_fables": 3.6135241379813543,
            "boolq": 3.337773546968396,
            "agi_eval_sat_en": 1.7627324583460984,
            "winogender_mc_female": 1.3073387384414672,
            "winogender_mc_male": 1.0064350853363673,
            "enterprise_pii_classification": 5.476408576403521,
            "bbq": 0.31081410509120266,
            "human_eval_return_complex": 2.4261721982730657,
            "human_eval_return_simple": 4.938390860686431,
            "human_eval-0.5": 2.306652779986219,
            "human_eval-0.25": 2.395945354932692,
            "human_eval-0.75": 2.3991306823928182,
            "human_eval": 2.5364213607660155,
            "processed_human_eval_cpp": 2.535315906038936,
            "processed_human_eval_js": 2.486473206339813
        },
        "icl": {
            "mmlu_zeroshot": 0.2472272460397921,
            "hellaswag_zeroshot": 0.5678151845932007,
            "jeopardy": 0.3033249109983444,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6159637570381165,
            "arc_easy": 0.6599326729774475,
            "arc_challenge": 0.3336177468299866,
            "bigbench_misconceptions": 0.47488585114479065,
            "copa": 0.7200000286102295,
            "siqa": 0.5163766741752625,
            "commonsense_qa": 0.2710892856121063,
            "piqa": 0.7383025288581848,
            "openbook_qa": 0.3779999911785126,
            "bigbench_novel_concepts": 0.46875,
            "bigbench_strange_stories": 0.545976996421814,
            "bigbench_strategy_qa": 0.5391000509262085,
            "lambada_openai": 0.6002328991889954,
            "hellaswag": 0.571200966835022,
            "winograd": 0.8021978139877319,
            "winogrande": 0.580110490322113,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.24979999661445618,
            "bigbench_conceptual_combinations": 0.27184465527534485,
            "bigbench_elementary_math_qa": 0.2412997931241989,
            "bigbench_dyck_languages": 0.1679999977350235,
            "agi_eval_lsat_ar": 0.25652173161506653,
            "bigbench_cs_algorithms": 0.44090908765792847,
            "bigbench_logical_deduction": 0.25200000405311584,
            "bigbench_operators": 0.19523809850215912,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.004000000189989805,
            "simple_arithmetic_withspaces": 0.004000000189989805,
            "math_qa": 0.2366744875907898,
            "logi_qa": 0.23963133990764618,
            "pubmed_qa_labeled": 0.27399998903274536,
            "squad": 0.3834437131881714,
            "agi_eval_lsat_rc": 0.26119402050971985,
            "agi_eval_lsat_lr": 0.272549033164978,
            "coqa": 0.30853062868118286,
            "bigbench_understanding_fables": 0.22751322388648987,
            "boolq": 0.5733944773674011,
            "agi_eval_sat_en": 0.22815534472465515,
            "winogender_mc_female": 0.4166666567325592,
            "winogender_mc_male": 0.5833333134651184,
            "enterprise_pii_classification": 0.473343163728714,
            "bbq": 0.47525547309355304,
            "mmlu_fewshot": 0.2548511190372601,
            "gsm8k_cot": 0.009855951182544231,
            "agi_eval_sat_math_cot": 0.027272727340459824,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.05999999865889549,
            "gpqa_main": 0.2008928507566452,
            "gpqa_diamond": 0.20202019810676575
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.24464406488022988,
        "language understanding": 0.3056356198870555,
        "reading comprehension": 0.1269715448285927,
        "safety": -0.02570069649002768,
        "symbolic problem solving": 0.0736584406784102,
        "world knowledge": 0.14007828153196683
    },
    "aggregated_centered_results": 0.14756351842080634,
    "aggregated_results": 0.33500191282360986,
    "rw_small": 0.5724784334500631,
    "95%_CI_above": 0.4360625244193263,
    "99%_CI_above": 0.4441377897625384,
    "model_uuid": "f44ef940-a7b4-4ced-a81a-a2a55f91b815",
    "low_variance_datasets": 0.44171027310869915,
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_openhermes_vs_rw_v2_1M_trigram_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "rw_small_centered": 0.2598992885901914,
    "95%_CI_above_centered": 0.2495481449917489,
    "99%_CI_above_centered": 0.2884858351709459,
    "low_variance_datasets_centered": 0.2995084253847312,
    "Core": 0.2995084253847312,
    "Extended": 0.14756351842080634
}