{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "4ac78246-c812-4f98-98b3-1cd464facc15",
    "model": "open_lm_1b",
    "creation_date": "2024_02_24-20_38_52",
    "eval_metrics": {
        "perplexity": 2.545923473437627,
        "downstream_perpexity": {
            "mmlu": 1.5441305388617832,
            "hellaswag": 2.350122907061521,
            "jeopardy_all": 1.5999807367980283,
            "triviaqa_sm_sub": 2.201931023836136,
            "gsm8k": 1.7882425813082043,
            "agi_eval_sat_math": 1.5251082745465365,
            "aqua": 2.2526404166708187,
            "svamp": 2.699918437798818,
            "bigbench_qa_wikidata": 3.249552423429503,
            "arc_easy": 2.2866698449809943,
            "arc_challenge": 2.4362276923229262,
            "bigbench_misconceptions": 4.086268101653007,
            "copa": 2.5022680056095123,
            "siqa": 1.4534243078046347,
            "commonsense_qa": 1.596507673864966,
            "piqa": 2.6785282442696854,
            "openbook_qa": 4.162376477241516,
            "bigbench_novel_concepts": 2.400863528251648,
            "bigbench_strange_stories": 3.6139525908163224,
            "bigbench_strategy_qa": 1.9058347105615365,
            "lambada_openai": 1.379213195041295,
            "winograd_wsc": 2.4743463006211726,
            "winogrande": 3.109170555597981,
            "bigbench_conlang_translation": 1.8789466793944196,
            "bigbench_language_identification": 3.3926013420047365,
            "bigbench_conceptual_combinations": 0.8796170672166694,
            "bigbench_elementary_math_qa": 3.9651676952151633,
            "bigbench_dyck_languages": 3.91839764046669,
            "agi_eval_lsat_ar": 1.5905901457952416,
            "bigbench_cs_algorithms": 5.571864593751503,
            "bigbench_logical_deduction": 1.1365439193646114,
            "bigbench_operators": 4.816725088301159,
            "bigbench_repeat_copy_logic": 1.4488573335111141,
            "simple_arithmetic_nospaces": 6.594502958774567,
            "simple_arithmetic_withspaces": 6.076026554107666,
            "math_qa": 3.8291965902471077,
            "logi_qa": 1.8562229540307766,
            "pubmed_qa_labeled": 5.274953062534332,
            "squad": 2.3994319446490637,
            "agi_eval_lsat_rc": 1.6133649513792636,
            "agi_eval_lsat_lr": 1.711805008205713,
            "coqa": 2.9111467665722834,
            "bigbench_understanding_fables": 3.6407319502855735,
            "boolq": 3.514756650443471,
            "agi_eval_sat_en": 1.6130409310164961,
            "winogender_mc_female": 1.109237951040268,
            "winogender_mc_male": 0.8445640092094739,
            "enterprise_pii_classification": 5.022436109523183,
            "bbq": 0.29879145334490476,
            "human_eval_return_complex": 2.364013613678339,
            "human_eval_return_simple": 5.141758093962798,
            "human_eval-0.5": 2.228845129652721,
            "human_eval-0.25": 2.3102967470157436,
            "human_eval-0.75": 2.3247293699078444,
            "human_eval": 2.411816171029719,
            "processed_human_eval_cpp": 2.31809958019612,
            "processed_human_eval_js": 2.316929240052293
        },
        "icl": {
            "mmlu_zeroshot": 0.2599805277167705,
            "hellaswag_zeroshot": 0.5700059533119202,
            "jeopardy": 0.31493101119995115,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6174892783164978,
            "arc_easy": 0.6611952781677246,
            "arc_challenge": 0.3609215021133423,
            "bigbench_misconceptions": 0.4794520437717438,
            "copa": 0.699999988079071,
            "siqa": 0.5015352964401245,
            "commonsense_qa": 0.3439803421497345,
            "piqa": 0.7437431812286377,
            "openbook_qa": 0.3720000088214874,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.5862069129943848,
            "bigbench_strategy_qa": 0.5198776721954346,
            "lambada_openai": 0.5864545106887817,
            "hellaswag": 0.5726946592330933,
            "winograd": 0.791208803653717,
            "winogrande": 0.5785319805145264,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.250900000333786,
            "bigbench_conceptual_combinations": 0.20388349890708923,
            "bigbench_elementary_math_qa": 0.24405136704444885,
            "bigbench_dyck_languages": 0.2280000001192093,
            "agi_eval_lsat_ar": 0.22608695924282074,
            "bigbench_cs_algorithms": 0.3969697058200836,
            "bigbench_logical_deduction": 0.24199999868869781,
            "bigbench_operators": 0.19523809850215912,
            "bigbench_repeat_copy_logic": 0.09375,
            "simple_arithmetic_nospaces": 0.006000000052154064,
            "simple_arithmetic_withspaces": 0.004000000189989805,
            "math_qa": 0.2527656853199005,
            "logi_qa": 0.27496159076690674,
            "pubmed_qa_labeled": 0.44999998807907104,
            "squad": 0.3724692463874817,
            "agi_eval_lsat_rc": 0.24253731966018677,
            "agi_eval_lsat_lr": 0.26862746477127075,
            "coqa": 0.30790430307388306,
            "bigbench_understanding_fables": 0.24338623881340027,
            "boolq": 0.6293578147888184,
            "agi_eval_sat_en": 0.25242719054222107,
            "winogender_mc_female": 0.44999998807907104,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.5301914811134338,
            "bbq": 0.462156973101876,
            "mmlu_fewshot": 0.25481059075447554,
            "gsm8k_cot": 0.012888551689684391,
            "agi_eval_sat_math_cot": 0.013636363670229912,
            "aqua_cot": 0.016326529905200005,
            "svamp_cot": 0.04333333298563957,
            "gpqa_main": 0.2209821492433548,
            "gpqa_diamond": 0.22727273404598236
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.2587467776834853,
        "language understanding": 0.28944772929163204,
        "reading comprehension": 0.16892804988055377,
        "safety": -0.0038257728923450918,
        "symbolic problem solving": 0.08153795303232525,
        "world knowledge": 0.15385414197785666
    },
    "aggregated_centered_results": 0.16006203921772513,
    "aggregated_results": 0.34425943667519876,
    "rw_small": 0.5833354194959005,
    "95%_CI_above": 0.45014299391345547,
    "99%_CI_above": 0.46274452585241066,
    "model_uuid": "eb2eb484-3cc6-4fe3-b534-917a8dd2a9db",
    "rw_small_centered": 0.284086679506023,
    "95%_CI_above_centered": 0.2674250294349906,
    "99%_CI_above_centered": 0.3131966740012305,
    "low_variance_datasets": 0.4506287557157603,
    "low_variance_datasets_centered": 0.3122294610385746,
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_openhermes_decontaminated_vs_rw_v2_bigram_100k_train_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy_copy.json",
    "missing tasks": "[]",
    "Core": 0.3122294610385746,
    "Extended": 0.16006203921772513
}