{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "d34662e7-6bf0-4587-9841-2af25d743f4e",
    "model": "open_lm_1b",
    "creation_date": "2024_02_11-21_00_11",
    "eval_metrics": {
        "perplexity": 2.506069242954254,
        "downstream_perpexity": {
            "mmlu": 1.6287219027154654,
            "hellaswag": 2.361860373394823,
            "jeopardy_all": 1.8036266845397133,
            "triviaqa_sm_sub": 2.5473697142998377,
            "gsm8k": 1.7986595082951822,
            "agi_eval_sat_math": 1.5662895581939005,
            "aqua": 2.272486292099466,
            "svamp": 2.4836169878641763,
            "bigbench_qa_wikidata": 3.5214640221454796,
            "arc_easy": 2.427702427010745,
            "arc_challenge": 2.547087490965482,
            "bigbench_misconceptions": 5.187184969584147,
            "copa": 2.5993961238861085,
            "siqa": 1.4216438262831104,
            "commonsense_qa": 1.7097774122114753,
            "piqa": 2.7389899723158826,
            "openbook_qa": 4.201986603736877,
            "bigbench_novel_concepts": 2.5847994312644005,
            "bigbench_strange_stories": 3.5903672000457503,
            "bigbench_strategy_qa": 1.7007766609475206,
            "lambada_openai": 1.4287359362233891,
            "winograd_wsc": 2.512425262412745,
            "winogrande": 3.1152619830058517,
            "bigbench_conlang_translation": 1.9560990762419816,
            "bigbench_language_identification": 2.3589846273497788,
            "bigbench_conceptual_combinations": 0.9086958939589344,
            "bigbench_elementary_math_qa": 4.090260851592513,
            "bigbench_dyck_languages": 4.563263928890228,
            "agi_eval_lsat_ar": 1.7386209353156712,
            "bigbench_cs_algorithms": 4.092272520968408,
            "bigbench_logical_deduction": 0.9676667535702388,
            "bigbench_operators": 4.714979758149101,
            "bigbench_repeat_copy_logic": 1.4681117124855518,
            "simple_arithmetic_nospaces": 6.600396374702454,
            "simple_arithmetic_withspaces": 6.282869350910187,
            "math_qa": 2.8910963624026444,
            "logi_qa": 1.8816558851441296,
            "pubmed_qa_labeled": 4.8988526654243465,
            "squad": 2.1922803268640134,
            "agi_eval_lsat_rc": 1.7807505104079175,
            "agi_eval_lsat_lr": 1.799611317176445,
            "coqa": 3.253309184158265,
            "bigbench_understanding_fables": 1.9189129859682112,
            "boolq": 3.04971805493766,
            "agi_eval_sat_en": 1.736998284904702,
            "winogender_mc_female": 0.9600314110517502,
            "winogender_mc_male": 0.9369324843088785,
            "enterprise_pii_classification": 5.849613119611333,
            "bbq": 0.3615553275855457,
            "human_eval_return_complex": 2.4402303301443267,
            "human_eval_return_simple": 5.157543562554024,
            "human_eval-0.5": 2.4104384532788905,
            "human_eval-0.25": 2.49217001112496,
            "human_eval-0.75": 2.4741882034918157,
            "human_eval": 2.6199673450574643,
            "processed_human_eval_cpp": 2.6029438454171885,
            "processed_human_eval_js": 2.5480383292930884
        },
        "icl": {
            "mmlu_zeroshot": 0.23983676757728845,
            "hellaswag_zeroshot": 0.5609440207481384,
            "jeopardy": 0.23738162815570832,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6072043776512146,
            "arc_easy": 0.6372053623199463,
            "arc_challenge": 0.34385666251182556,
            "bigbench_misconceptions": 0.4840182662010193,
            "copa": 0.6499999761581421,
            "siqa": 0.4759467840194702,
            "commonsense_qa": 0.21048320829868317,
            "piqa": 0.7257888913154602,
            "openbook_qa": 0.3700000047683716,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.568965494632721,
            "bigbench_strategy_qa": 0.5251201391220093,
            "lambada_openai": 0.5610324144363403,
            "hellaswag": 0.5626369118690491,
            "winograd": 0.7472527623176575,
            "winogrande": 0.5808997750282288,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.24889999628067017,
            "bigbench_conceptual_combinations": 0.18446601927280426,
            "bigbench_elementary_math_qa": 0.24418239295482635,
            "bigbench_dyck_languages": 0.21899999678134918,
            "agi_eval_lsat_ar": 0.2652173936367035,
            "bigbench_cs_algorithms": 0.4515151381492615,
            "bigbench_logical_deduction": 0.2706666588783264,
            "bigbench_operators": 0.2142857164144516,
            "bigbench_repeat_copy_logic": 0.125,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.004000000189989805,
            "math_qa": 0.25544753670692444,
            "logi_qa": 0.24270352721214294,
            "pubmed_qa_labeled": 0.3619999885559082,
            "squad": 0.3973509967327118,
            "agi_eval_lsat_rc": 0.2238806039094925,
            "agi_eval_lsat_lr": 0.29411765933036804,
            "coqa": 0.3038957715034485,
            "bigbench_understanding_fables": 0.28042328357696533,
            "boolq": 0.5957186818122864,
            "agi_eval_sat_en": 0.223300963640213,
            "winogender_mc_female": 0.46666666865348816,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.5054491758346558,
            "bbq": 0.4976888190616261,
            "mmlu_fewshot": 0.23929953418279948,
            "gsm8k_cot": 0.0037907506339251995,
            "agi_eval_sat_math_cot": 0.013636363670229912,
            "aqua_cot": 0.02448979578912258,
            "svamp_cot": 0.05999999865889549,
            "gpqa_main": 0.2232142835855484,
            "gpqa_diamond": 0.22727273404598236
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.21277075483841168,
        "language understanding": 0.2699372126609169,
        "reading comprehension": 0.15147209617619714,
        "safety": -0.023431008512323548,
        "symbolic problem solving": 0.09164785397511196,
        "world knowledge": 0.13605363305032453
    },
    "aggregated_centered_results": 0.14556548331103467,
    "aggregated_results": 0.3352115243892559,
    "rw_small": 0.5614374130964279,
    "95%_CI_above": 0.43514633427386157,
    "99%_CI_above": 0.44496168934780617,
    "model_uuid": "a69a540f-0ad1-4cc2-8a53-83e800eecf3d",
    "low_variance_datasets": 0.43707134940407494,
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_sharegpt_vs_rw_v2_unigram_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124_heavy.json",
    "missing tasks": "[]",
    "rw_small_centered": 0.24231822623146906,
    "95%_CI_above_centered": 0.24488853093672772,
    "99%_CI_above_centered": 0.2852622617090094,
    "low_variance_datasets_centered": 0.289174592987609,
    "Core": 0.289174592987609,
    "Extended": 0.14556548331103467
}