{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "14cce002-f44b-4148-8aac-08def083fb73",
    "model": "open_lm_7b_swiglutorch",
    "creation_date": "2024_04_25-16_01_26",
    "eval_metrics": {
        "perplexity": 2.3620889723300933,
        "downstream_perpexity": {
            "mmlu": 1.620793972416424,
            "hellaswag": 2.103131929032239,
            "jeopardy_all": 0.9308412361786255,
            "triviaqa_sm_sub": 1.4713796773379049,
            "gsm8k": 1.3931993568729866,
            "agi_eval_sat_math": 1.2537053330378098,
            "aqua": 1.8642238475838486,
            "svamp": 2.4620110623041787,
            "bigbench_qa_wikidata": 2.895414482035316,
            "arc_easy": 1.8134846996879739,
            "arc_challenge": 2.000505496058041,
            "bigbench_misconceptions": 2.814827239676698,
            "copa": 2.2640444111824034,
            "siqa": 1.3168473726895351,
            "commonsense_qa": 1.7513557013481196,
            "piqa": 2.2768357160042108,
            "openbook_qa": 3.7453627722263336,
            "bigbench_novel_concepts": 2.1105258837342262,
            "bigbench_strange_stories": 2.5988708117912553,
            "bigbench_strategy_qa": 1.6666540178788078,
            "lambada_openai": 0.9074278349635551,
            "winograd_wsc": 2.25214008643077,
            "winogrande": 2.8752659503738056,
            "bigbench_conlang_translation": 1.5306259393692017,
            "bigbench_language_identification": 1.7527714651282917,
            "bigbench_conceptual_combinations": 0.9125785150574249,
            "bigbench_elementary_math_qa": 3.8265274865166194,
            "bigbench_dyck_languages": 2.51704048371315,
            "agi_eval_lsat_ar": 1.6955095576203387,
            "bigbench_cs_algorithms": 2.6470435187672123,
            "bigbench_logical_deduction": 1.2842903134028116,
            "bigbench_operators": 4.277678792817252,
            "bigbench_repeat_copy_logic": 1.0069203916937113,
            "simple_arithmetic_nospaces": 5.9711473865509035,
            "simple_arithmetic_withspaces": 4.9953316192626955,
            "math_qa": 1.9582466625983468,
            "logi_qa": 1.915786892404571,
            "pubmed_qa_labeled": 6.528172512054443,
            "squad": 1.4035357033384501,
            "agi_eval_lsat_rc": 1.9441773028516058,
            "agi_eval_lsat_lr": 1.7173495979870066,
            "coqa": 1.2114233980796458,
            "bigbench_understanding_fables": 1.707594637517576,
            "boolq": 3.3870902902124853,
            "agi_eval_sat_en": 2.0639581165267424,
            "winogender_mc_female": 1.0242241382598878,
            "winogender_mc_male": 0.6014894172549248,
            "enterprise_pii_classification": 4.8564861388901726,
            "bbq": 0.3044896019979469,
            "human_eval_return_complex": 0.9023105614767299,
            "human_eval_return_simple": 2.1616170245247917,
            "human_eval-0.5": 0.8381332288064608,
            "human_eval-0.25": 0.8509398444396693,
            "human_eval-0.75": 0.8723937805469443,
            "human_eval": 0.9130610031325642,
            "processed_human_eval_cpp": 1.1012434870559977,
            "processed_human_eval_js": 1.0842719292495309
        },
        "icl": {
            "mmlu_zeroshot": 0.3023613591988881,
            "hellaswag_zeroshot": 0.7429794669151306,
            "jeopardy": 0.47217350602149966,
            "triviaqa_sm_sub": 0.4623333215713501,
            "gsm8k": 0.025777103379368782,
            "agi_eval_sat_math": 0.022727273404598236,
            "aqua": 0.01224489789456129,
            "bigbench_qa_wikidata": 0.6863835453987122,
            "arc_easy": 0.753367006778717,
            "arc_challenge": 0.4906143248081207,
            "mmlu_fewshot": 0.383087976982719,
            "bigbench_misconceptions": 0.5525113940238953,
            "copa": 0.8100000023841858,
            "siqa": 0.6100307106971741,
            "commonsense_qa": 0.4848484992980957,
            "piqa": 0.8068552613258362,
            "openbook_qa": 0.44200000166893005,
            "bigbench_novel_concepts": 0.5625,
            "bigbench_strange_stories": 0.6609195470809937,
            "bigbench_strategy_qa": 0.5600699186325073,
            "lambada_openai": 0.7162817716598511,
            "hellaswag": 0.7508464455604553,
            "winograd": 0.860805869102478,
            "winogrande": 0.6685082912445068,
            "bigbench_conlang_translation": 0.05487804859876633,
            "bigbench_language_identification": 0.28790000081062317,
            "bigbench_conceptual_combinations": 0.3300970792770386,
            "bigbench_elementary_math_qa": 0.26063939929008484,
            "bigbench_dyck_languages": 0.2409999966621399,
            "agi_eval_lsat_ar": 0.25217390060424805,
            "bigbench_cs_algorithms": 0.45075756311416626,
            "bigbench_logical_deduction": 0.24400000274181366,
            "bigbench_operators": 0.2571428716182709,
            "bigbench_repeat_copy_logic": 0.125,
            "simple_arithmetic_nospaces": 0.029999999329447746,
            "simple_arithmetic_withspaces": 0.032999999821186066,
            "math_qa": 0.25209519267082214,
            "logi_qa": 0.29032257199287415,
            "pubmed_qa_labeled": 0.38100001215934753,
            "squad": 0.543330192565918,
            "agi_eval_lsat_rc": 0.26492536067962646,
            "agi_eval_lsat_lr": 0.2764706015586853,
            "coqa": 0.4433170557022095,
            "bigbench_understanding_fables": 0.29629629850387573,
            "boolq": 0.7174311876296997,
            "agi_eval_sat_en": 0.29611650109291077,
            "winogender_mc_female": 0.5333333611488342,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.5340206027030945,
            "bbq": 0.45690445466475055,
            "gpqa_main": 0.21875,
            "gpqa_diamond": 0.19191919267177582,
            "gsm8k_cot": 0.027293404564261436,
            "agi_eval_sat_math_cot": 0.022727273404598236,
            "aqua_cot": 0.020408162847161293,
            "svamp_cot": 0.16333332657814026
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.39452668525528034,
        "language understanding": 0.4347871406684139,
        "reading comprehension": 0.24783113958281383,
        "safety": 0.03712921521880411,
        "symbolic problem solving": 0.11064249045262325,
        "world knowledge": 0.284604630829298
    },
    "aggregated_centered_results": 0.25040302358508704,
    "aggregated_results": 0.41181248768455375,
    "rw_small": 0.6971500466267267,
    "rw_small_centered": 0.47711591901834954,
    "95%_CI_above": 0.5296689785533137,
    "95%_CI_above_centered": 0.3767581339335347,
    "99%_CI_above": 0.5491668423880701,
    "99%_CI_above_centered": 0.4316312503532814,
    "low_variance_datasets": 0.5456234891306271,
    "low_variance_datasets_centered": 0.4406858728462648,
    "model_uuid": "f29f31b5-385a-4929-a4e2-54ad378a6405",
    "_filename": "exp_data/evals/evaluation_rw_v2_cc_v3_f0.15_resiliparse_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_7b_swiglutorch-warm=5000-lr=0p001-wd=0p1-cd=3e-05-bs=2048-mult=1-seed=124-tokens=137788211200_heavy.json",
    "missing tasks": "[]",
    "Core": 0.4406858728462648,
    "Extended": 0.25040302358508704
}