{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "0a4edc73-78c0-4aa6-a45d-e695b3df1171",
    "model": "open_lm_7b_swiglutorch",
    "creation_date": "2024_05_01-23_31_48",
    "eval_metrics": {
        "perplexity": 2.283830451965332,
        "downstream_perpexity": {
            "mmlu": 1.6152484787103034,
            "hellaswag": 2.1032624535768587,
            "jeopardy_all": 0.8698022652929495,
            "triviaqa_sm_sub": 1.5399687602370977,
            "gsm8k": 1.3753156560763344,
            "agi_eval_sat_math": 1.2353183161128651,
            "aqua": 1.8540859567875765,
            "svamp": 2.413826026121775,
            "bigbench_qa_wikidata": 2.9273265258590895,
            "arc_easy": 1.8158549498578516,
            "arc_challenge": 2.0186268191390475,
            "bigbench_misconceptions": 2.803104308097874,
            "copa": 2.180511302947998,
            "siqa": 1.0791131327559784,
            "commonsense_qa": 1.7202318396166052,
            "piqa": 2.279632071740479,
            "openbook_qa": 3.7535974190235137,
            "bigbench_novel_concepts": 1.9729658216238022,
            "bigbench_strange_stories": 2.5499092121233886,
            "bigbench_strategy_qa": 1.694763693969913,
            "lambada_openai": 0.8936321884426224,
            "winograd_wsc": 2.224331054932032,
            "winogrande": 2.8690753287937856,
            "bigbench_conlang_translation": 1.4714835605970242,
            "bigbench_language_identification": 1.591971575116892,
            "bigbench_conceptual_combinations": 0.7756873513888387,
            "bigbench_elementary_math_qa": 3.958279531937975,
            "bigbench_dyck_languages": 3.256664707899094,
            "agi_eval_lsat_ar": 1.752648651599884,
            "bigbench_cs_algorithms": 3.347724037459402,
            "bigbench_logical_deduction": 0.9275080974499385,
            "bigbench_operators": 3.9857661298343112,
            "bigbench_repeat_copy_logic": 0.9470100589096546,
            "simple_arithmetic_nospaces": 6.0311113848686215,
            "simple_arithmetic_withspaces": 5.107760020256043,
            "math_qa": 1.6756489736459177,
            "logi_qa": 1.828423765397841,
            "pubmed_qa_labeled": 4.808564385890961,
            "squad": 1.4635515951402591,
            "agi_eval_lsat_rc": 1.8527066930016476,
            "agi_eval_lsat_lr": 1.778749271000133,
            "coqa": 1.066915361486729,
            "bigbench_understanding_fables": 1.599047929521591,
            "boolq": 3.76289673639,
            "agi_eval_sat_en": 1.9304703890698627,
            "winogender_mc_female": 0.816808569431305,
            "winogender_mc_male": 0.7779873937368393,
            "enterprise_pii_classification": 4.8154597536629185,
            "bbq": 0.25014584649102395,
            "human_eval_return_complex": 0.8621284858448299,
            "human_eval_return_simple": 1.981802304048796,
            "human_eval-0.5": 0.7897099750797923,
            "human_eval-0.25": 0.8258802192967113,
            "human_eval-0.75": 0.8045722353385716,
            "human_eval": 0.8876545371078863,
            "processed_human_eval_cpp": 1.0890736083806671,
            "processed_human_eval_js": 1.037239703826788
        },
        "icl": {
            "mmlu_zeroshot": 0.3203932771034408,
            "hellaswag_zeroshot": 0.7423819899559021,
            "jeopardy": 0.45897531509399414,
            "triviaqa_sm_sub": 0.4436666667461395,
            "gsm8k": 0.027293404564261436,
            "agi_eval_sat_math": 0.013636363670229912,
            "aqua": 0.020408162847161293,
            "svamp": 0.12999999523162842,
            "bigbench_qa_wikidata": 0.6904680132865906,
            "arc_easy": 0.7542087435722351,
            "arc_challenge": 0.4795221984386444,
            "mmlu_fewshot": 0.3750332766457608,
            "bigbench_misconceptions": 0.5159817337989807,
            "copa": 0.8399999737739563,
            "siqa": 0.5931422710418701,
            "commonsense_qa": 0.37264537811279297,
            "piqa": 0.7948857545852661,
            "openbook_qa": 0.4320000112056732,
            "bigbench_novel_concepts": 0.625,
            "bigbench_strange_stories": 0.6666666865348816,
            "bigbench_strategy_qa": 0.5631279945373535,
            "lambada_openai": 0.717640221118927,
            "hellaswag": 0.7517426609992981,
            "winograd": 0.8571428656578064,
            "winogrande": 0.6732438802719116,
            "bigbench_conlang_translation": 0.0731707289814949,
            "bigbench_language_identification": 0.2799000144004822,
            "bigbench_conceptual_combinations": 0.3300970792770386,
            "bigbench_elementary_math_qa": 0.262552410364151,
            "bigbench_dyck_languages": 0.18199999630451202,
            "agi_eval_lsat_ar": 0.27826085686683655,
            "bigbench_cs_algorithms": 0.45075756311416626,
            "bigbench_logical_deduction": 0.2633333206176758,
            "bigbench_operators": 0.2666666805744171,
            "bigbench_repeat_copy_logic": 0.21875,
            "simple_arithmetic_nospaces": 0.02800000086426735,
            "simple_arithmetic_withspaces": 0.028999999165534973,
            "math_qa": 0.2708682417869568,
            "logi_qa": 0.301075279712677,
            "pubmed_qa_labeled": 0.5630000233650208,
            "squad": 0.5328287482261658,
            "agi_eval_lsat_rc": 0.26119402050971985,
            "agi_eval_lsat_lr": 0.2823529541492462,
            "coqa": 0.4393085241317749,
            "bigbench_understanding_fables": 0.3650793731212616,
            "boolq": 0.7223241329193115,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4833333194255829,
            "winogender_mc_male": 0.5333333611488342,
            "enterprise_pii_classification": 0.5057437419891357,
            "bbq": 0.5051930980248884,
            "gpqa_main": 0.21875,
            "gpqa_diamond": 0.22727273404598236
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.3886378275594492,
        "language understanding": 0.4364613598987165,
        "reading comprehension": 0.2822694300853631,
        "safety": 0.013801760294220644,
        "symbolic problem solving": 0.14195560485328712,
        "world knowledge": 0.2791980435799437
    },
    "aggregated_centered_results": 0.2707683279736879,
    "aggregated_results": 0.44577522444379414,
    "rw_small": 0.7019468198219935,
    "rw_small_centered": 0.4884298307853832,
    "95%_CI_above": 0.5374639544684391,
    "95%_CI_above_centered": 0.3860380589834306,
    "99%_CI_above": 0.5532390118941016,
    "99%_CI_above_centered": 0.4344264558430486,
    "low_variance_datasets": 0.542529705573212,
    "low_variance_datasets_centered": 0.43750598972577487,
    "model_uuid": "256dbb0c-4683-4707-9f32-04d941876149",
    "_filename": "exp_data/evals/evaluation_rw_v2_cc_v3_f0.15_resiliparse_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_7b_swiglutorch-warm=5000-lr=0p01-wd=0p01-cd=3e-05-bs=2048-mult=1-seed=124-tokens=137788211200_heavy.json",
    "missing tasks": "['gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot']",
    "Core": 0.43750598972577487,
    "Extended": "N/A due to missing tasks: ['gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot']"
}