{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "8cb7c972-4839-4836-8bce-5f78c4cf1c0a",
    "model": "open_lm_1b_swiglutorch",
    "creation_date": "2024_04_25-11_20_27",
    "eval_metrics": {
        "perplexity": 2.7234746336936952,
        "downstream_perpexity": {
            "mmlu": 1.6434792698473863,
            "hellaswag": 2.3113048891697336,
            "jeopardy_all": 1.4412778628621397,
            "triviaqa_sm_sub": 2.287915689965089,
            "gsm8k": 1.785984785742973,
            "agi_eval_sat_math": 1.500496424328197,
            "aqua": 2.246399218695504,
            "svamp": 2.53155885775884,
            "bigbench_qa_wikidata": 3.432600469384147,
            "arc_easy": 2.138091513002762,
            "arc_challenge": 2.343045782056278,
            "bigbench_misconceptions": 4.56660542422778,
            "copa": 2.431453515291214,
            "siqa": 1.4341623086402213,
            "commonsense_qa": 1.6894311846327723,
            "piqa": 2.4973967950933,
            "openbook_qa": 3.991044327259064,
            "bigbench_novel_concepts": 2.3302198871970177,
            "bigbench_strange_stories": 2.6181394231730493,
            "bigbench_strategy_qa": 1.6682704191324542,
            "lambada_openai": 1.251349333126453,
            "winograd_wsc": 2.443858415652544,
            "winogrande": 3.0176694389394445,
            "bigbench_conlang_translation": 2.018007989336805,
            "bigbench_language_identification": 1.6031494528132693,
            "bigbench_conceptual_combinations": 0.9249190005283912,
            "bigbench_elementary_math_qa": 3.6912252634420084,
            "bigbench_dyck_languages": 4.4565692672729496,
            "agi_eval_lsat_ar": 1.5922633554624475,
            "bigbench_cs_algorithms": 4.487160710313104,
            "bigbench_logical_deduction": 0.9303945077657699,
            "bigbench_operators": 4.915579322406224,
            "bigbench_repeat_copy_logic": 1.3991660699248314,
            "simple_arithmetic_nospaces": 6.426175340175629,
            "simple_arithmetic_withspaces": 5.521491614818573,
            "math_qa": 1.9235384303213485,
            "logi_qa": 1.676685198111468,
            "pubmed_qa_labeled": 3.6578174040317535,
            "squad": 1.7829538878226123,
            "agi_eval_lsat_rc": 1.9333377636190672,
            "agi_eval_lsat_lr": 1.6259068421289034,
            "coqa": 1.6456754475133872,
            "bigbench_understanding_fables": 1.532800277074178,
            "boolq": 2.4350119013305105,
            "agi_eval_sat_en": 1.9238251904839452,
            "winogender_mc_female": 1.079154802362124,
            "winogender_mc_male": 0.9120355556408565,
            "enterprise_pii_classification": 4.64755518987709,
            "bbq": 0.3652463284831468,
            "human_eval_return_complex": 1.2608430085219735,
            "human_eval_return_simple": 2.8071837199700846,
            "human_eval-0.5": 1.195977023098527,
            "human_eval-0.25": 1.2284473758645174,
            "human_eval-0.75": 1.222224655311282,
            "human_eval": 1.3203992058591145,
            "processed_human_eval_cpp": 1.5007309958061077,
            "processed_human_eval_js": 1.3785061916200125
        },
        "icl": {
            "mmlu_zeroshot": 0.2520148727977485,
            "hellaswag_zeroshot": 0.5832503437995911,
            "jeopardy": 0.266605082154274,
            "triviaqa_sm_sub": 0.19300000369548798,
            "gsm8k": 0.012888551689684391,
            "agi_eval_sat_math": 0.0181818176060915,
            "aqua": 0.004081632476300001,
            "bigbench_qa_wikidata": 0.5970670580863953,
            "arc_easy": 0.6666666865348816,
            "arc_challenge": 0.3498293459415436,
            "mmlu_fewshot": 0.24753810465335846,
            "bigbench_misconceptions": 0.5068492889404297,
            "copa": 0.7099999785423279,
            "siqa": 0.48567041754722595,
            "commonsense_qa": 0.31122031807899475,
            "piqa": 0.7470076084136963,
            "openbook_qa": 0.38999998569488525,
            "bigbench_novel_concepts": 0.46875,
            "bigbench_strange_stories": 0.5632184147834778,
            "bigbench_strategy_qa": 0.5290519595146179,
            "lambada_openai": 0.6062487959861755,
            "hellaswag": 0.5908185839653015,
            "winograd": 0.7802197933197021,
            "winogrande": 0.5864246487617493,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.24369999766349792,
            "bigbench_conceptual_combinations": 0.26213592290878296,
            "bigbench_elementary_math_qa": 0.23760482668876648,
            "bigbench_dyck_languages": 0.17299999296665192,
            "agi_eval_lsat_ar": 0.20434781908988953,
            "bigbench_cs_algorithms": 0.4598484933376312,
            "bigbench_logical_deduction": 0.2513333261013031,
            "bigbench_operators": 0.190476194024086,
            "bigbench_repeat_copy_logic": 0.03125,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.2507542669773102,
            "logi_qa": 0.2688172161579132,
            "pubmed_qa_labeled": 0.5189999938011169,
            "squad": 0.37701040506362915,
            "agi_eval_lsat_rc": 0.25746268033981323,
            "agi_eval_lsat_lr": 0.25882354378700256,
            "coqa": 0.3007641136646271,
            "bigbench_understanding_fables": 0.24867725372314453,
            "boolq": 0.621100902557373,
            "agi_eval_sat_en": 0.2864077687263489,
            "winogender_mc_female": 0.46666666865348816,
            "winogender_mc_male": 0.5166666507720947,
            "enterprise_pii_classification": 0.5060383081436157,
            "bbq": 0.4409263404932889,
            "gpqa_main": 0.234375,
            "gpqa_diamond": 0.22727273404598236,
            "gsm8k_cot": 0.010614101774990559,
            "agi_eval_sat_math_cot": 0.00909090880304575,
            "aqua_cot": 0.01224489789456129,
            "svamp_cot": 0.05666666850447655
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.24863458105382963,
        "language understanding": 0.30585681501882533,
        "reading comprehension": 0.1818024579337553,
        "safety": -0.034851015968756255,
        "symbolic problem solving": 0.07510808694397511,
        "world knowledge": 0.17072997137817025
    },
    "aggregated_centered_results": 0.16197863747391672,
    "aggregated_results": 0.34686638737719244,
    "rw_small": 0.5862119843562444,
    "rw_small_centered": 0.28812358916154385,
    "95%_CI_above": 0.44778128182152654,
    "95%_CI_above_centered": 0.2640196342930339,
    "99%_CI_above": 0.4594867153012235,
    "99%_CI_above_centered": 0.3087867252818507,
    "low_variance_datasets": 0.44485709762031383,
    "low_variance_datasets_centered": 0.30631840306834895,
    "model_uuid": "21a192e3-ae4c-4706-8227-ce08f27dd0da",
    "_filename": "exp_data/evals/evaluation_cc_v4_resiliparse_rw_v2_bff1shards_shard_3_OH_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.30631840306834895,
    "Extended": 0.16197863747391672
}