{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "404c037f-3279-4645-9214-e53db5d800ba",
    "model": "open_lm_1b",
    "creation_date": "2024_05_04-13_34_04",
    "eval_metrics": {
        "perplexity": 2.5858235041300457,
        "downstream_perpexity": {
            "mmlu": 1.6191902842349841,
            "hellaswag": 2.3545440188966125,
            "jeopardy_all": 1.6443522235090031,
            "triviaqa_sm_sub": 2.251159926658496,
            "gsm8k": 1.8968411197799728,
            "agi_eval_sat_math": 1.5540604358369654,
            "aqua": 2.2591400837411686,
            "svamp": 2.5781490707397463,
            "bigbench_qa_wikidata": 3.242692776438776,
            "arc_easy": 2.559972341018353,
            "arc_challenge": 2.610178856261761,
            "bigbench_misconceptions": 3.9131532727855527,
            "copa": 2.5481278562545775,
            "siqa": 1.5175045227199153,
            "commonsense_qa": 1.7421153635396809,
            "piqa": 2.7636763026068336,
            "openbook_qa": 4.3099626920223235,
            "bigbench_novel_concepts": 2.4714382514357567,
            "bigbench_strange_stories": 4.1423673972316175,
            "bigbench_strategy_qa": 1.8764184544106246,
            "lambada_openai": 1.3974340673439234,
            "winograd_wsc": 2.52512866018456,
            "winogrande": 3.1325799204179785,
            "bigbench_conlang_translation": 2.035571923343147,
            "bigbench_language_identification": 3.9763788572667385,
            "bigbench_conceptual_combinations": 0.8995938986829184,
            "bigbench_elementary_math_qa": 3.9153885308426633,
            "bigbench_dyck_languages": 4.149455882310868,
            "agi_eval_lsat_ar": 1.7226684326710908,
            "bigbench_cs_algorithms": 4.7584068021991035,
            "bigbench_logical_deduction": 0.9872415233453115,
            "bigbench_operators": 4.757053088006519,
            "bigbench_repeat_copy_logic": 1.4162019528448582,
            "simple_arithmetic_nospaces": 6.702668702363968,
            "simple_arithmetic_withspaces": 6.565889820098877,
            "math_qa": 3.809324665434042,
            "logi_qa": 1.9620050739399666,
            "pubmed_qa_labeled": 5.52685321521759,
            "squad": 2.6051310661725537,
            "agi_eval_lsat_rc": 1.6366188366021683,
            "agi_eval_lsat_lr": 1.712672689849255,
            "coqa": 3.4131598416372873,
            "bigbench_understanding_fables": 3.6717336846407127,
            "boolq": 2.957011967186534,
            "agi_eval_sat_en": 1.7030873865757175,
            "winogender_mc_female": 1.0603724092245101,
            "winogender_mc_male": 1.1752782901128134,
            "enterprise_pii_classification": 4.297223006994279,
            "bbq": 0.34871519271509865,
            "human_eval_return_complex": 2.675168996720802,
            "human_eval_return_simple": 5.564603663779594,
            "human_eval-0.5": 2.5758677197665705,
            "human_eval-0.25": 2.703998523514445,
            "human_eval-0.75": 2.660852527473031,
            "human_eval": 2.9467871487140656,
            "processed_human_eval_cpp": 2.676169084466022,
            "processed_human_eval_js": 2.2425522782453675
        },
        "icl": {
            "mmlu_zeroshot": 0.2611418280162309,
            "hellaswag_zeroshot": 0.5454092621803284,
            "jeopardy": 0.30491683781147005,
            "triviaqa_sm_sub": 0.23600000143051147,
            "gsm8k": 0.006823351141065359,
            "agi_eval_sat_math": 0.0181818176060915,
            "aqua": 0.008163264952600002,
            "bigbench_qa_wikidata": 0.6246247887611389,
            "arc_easy": 0.6662458181381226,
            "arc_challenge": 0.3506825864315033,
            "mmlu_fewshot": 0.256730977119061,
            "bigbench_misconceptions": 0.465753436088562,
            "copa": 0.6899999976158142,
            "siqa": 0.48208802938461304,
            "commonsense_qa": 0.24897624552249908,
            "piqa": 0.737758457660675,
            "openbook_qa": 0.38600000739097595,
            "bigbench_novel_concepts": 0.59375,
            "bigbench_strange_stories": 0.517241358757019,
            "bigbench_strategy_qa": 0.5150721073150635,
            "lambada_openai": 0.5883951187133789,
            "hellaswag": 0.5496913194656372,
            "winograd": 0.761904776096344,
            "winogrande": 0.5682715177536011,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.24809999763965607,
            "bigbench_conceptual_combinations": 0.20388349890708923,
            "bigbench_elementary_math_qa": 0.24074947834014893,
            "bigbench_dyck_languages": 0.19900000095367432,
            "agi_eval_lsat_ar": 0.22173912823200226,
            "bigbench_cs_algorithms": 0.4598484933376312,
            "bigbench_logical_deduction": 0.25733333826065063,
            "bigbench_operators": 0.20000000298023224,
            "bigbench_repeat_copy_logic": 0.03125,
            "simple_arithmetic_nospaces": 0.004000000189989805,
            "simple_arithmetic_withspaces": 0.004000000189989805,
            "math_qa": 0.2601408064365387,
            "logi_qa": 0.2780337929725647,
            "pubmed_qa_labeled": 0.32100000977516174,
            "squad": 0.3705770969390869,
            "agi_eval_lsat_rc": 0.2537313401699066,
            "agi_eval_lsat_lr": 0.2705882489681244,
            "coqa": 0.2968808710575104,
            "bigbench_understanding_fables": 0.28042328357696533,
            "boolq": 0.5877676010131836,
            "agi_eval_sat_en": 0.24757280945777893,
            "winogender_mc_female": 0.5666666626930237,
            "winogender_mc_male": 0.4166666567325592,
            "enterprise_pii_classification": 0.4550810158252716,
            "bbq": 0.4708606925877658,
            "gpqa_main": 0.2165178507566452,
            "gpqa_diamond": 0.2373737394809723,
            "gsm8k_cot": 0.006065201014280319,
            "agi_eval_sat_math_cot": 0.013636363670229912,
            "aqua_cot": 0.008163264952600002,
            "svamp_cot": 0.05666666850447655
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.23480065959039842,
        "language understanding": 0.2722430004511844,
        "reading comprehension": 0.14454102650154055,
        "safety": -0.04536248608068985,
        "symbolic problem solving": 0.08123924204884055,
        "world knowledge": 0.17486389001036248
    },
    "aggregated_centered_results": 0.15091424677091436,
    "aggregated_results": 0.3407426911370206,
    "rw_small": 0.5680627971887589,
    "rw_small_centered": 0.2524729695236474,
    "95%_CI_above": 0.43670075162277594,
    "95%_CI_above_centered": 0.24656089823234403,
    "99%_CI_above": 0.4420009110284888,
    "99%_CI_above_centered": 0.28217980534656645,
    "low_variance_datasets": 0.438092723895203,
    "low_variance_datasets_centered": 0.29235458007622533,
    "model_uuid": "4f93af19-44bd-475b-a41d-acbe586bba8d",
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_openhermes_vs_rw_v2_bigram_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_rho1_d40a9cf2_0.6_0.8_heavy.json",
    "missing tasks": "[]",
    "Core": 0.29235458007622533,
    "Extended": 0.15091424677091436
}