{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "8d1ea27a-18a4-485f-af24-876a8e510f3c",
    "model": "open_lm_1b",
    "creation_date": "2024_03_09-10_42_41",
    "eval_metrics": {
        "perplexity": 2.3601535777250926,
        "downstream_perpexity": {
            "mmlu": 1.70638276133811,
            "hellaswag": 2.3549581763042062,
            "jeopardy_all": 1.4582200301780304,
            "triviaqa_sm_sub": 2.2335604830582936,
            "gsm8k": 1.7750460096522658,
            "agi_eval_sat_math": 1.5209610044956208,
            "aqua": 2.256734237378957,
            "svamp": 2.681430010795593,
            "bigbench_qa_wikidata": 3.256201741733066,
            "arc_easy": 2.2022647600784047,
            "arc_challenge": 2.3846325318459356,
            "bigbench_misconceptions": 4.786090101825592,
            "copa": 2.4806092286109926,
            "siqa": 1.607497652575084,
            "commonsense_qa": 1.68407066419025,
            "piqa": 2.5129470120835746,
            "openbook_qa": 4.093688158512116,
            "bigbench_novel_concepts": 2.56724514067173,
            "bigbench_strange_stories": 2.9764964881984666,
            "bigbench_strategy_qa": 1.6566876884421287,
            "lambada_openai": 1.2699609912169945,
            "winograd_wsc": 2.4427090326508325,
            "winogrande": 3.067872431114516,
            "bigbench_conlang_translation": 1.848570480579283,
            "bigbench_language_identification": 1.6142901976863535,
            "bigbench_conceptual_combinations": 0.9634409078116556,
            "bigbench_elementary_math_qa": 3.082079700132211,
            "bigbench_dyck_languages": 4.073641731262207,
            "agi_eval_lsat_ar": 1.788984475446784,
            "bigbench_cs_algorithms": 3.5132143024242284,
            "bigbench_logical_deduction": 1.0765900997320812,
            "bigbench_operators": 4.859581160545349,
            "bigbench_repeat_copy_logic": 1.4042647518217564,
            "simple_arithmetic_nospaces": 6.418116542816162,
            "simple_arithmetic_withspaces": 5.534049608707428,
            "math_qa": 2.1108810855684528,
            "logi_qa": 1.9651430969780308,
            "pubmed_qa_labeled": 2.3202740602493286,
            "squad": 1.899895337198631,
            "agi_eval_lsat_rc": 1.798202919426249,
            "agi_eval_lsat_lr": 1.8990602465236888,
            "coqa": 1.4859761029022702,
            "bigbench_understanding_fables": 1.5247231482197998,
            "boolq": 2.8528112526698215,
            "agi_eval_sat_en": 1.7954786570326795,
            "winogender_mc_female": 1.0553395489851634,
            "winogender_mc_male": 0.9713887264331181,
            "enterprise_pii_classification": 4.5812539675098805,
            "bbq": 0.2536487449692965,
            "human_eval_return_complex": 1.2420959763639554,
            "human_eval_return_simple": 2.6302116593799076,
            "human_eval-0.5": 1.1461253515103969,
            "human_eval-0.25": 1.1693804642049277,
            "human_eval-0.75": 1.1917783423531345,
            "human_eval": 1.2561241719054013,
            "processed_human_eval_cpp": 1.3292343971892173,
            "processed_human_eval_js": 1.2530303110436696
        },
        "icl": {
            "mmlu_zeroshot": 0.2649517825298142,
            "hellaswag_zeroshot": 0.558354914188385,
            "jeopardy": 0.31142769753932953,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.6202942728996277,
            "arc_easy": 0.6637205481529236,
            "arc_challenge": 0.355802059173584,
            "bigbench_misconceptions": 0.5022830963134766,
            "copa": 0.7200000286102295,
            "siqa": 0.4918116629123688,
            "commonsense_qa": 0.33742833137512207,
            "piqa": 0.7431991100311279,
            "openbook_qa": 0.36800000071525574,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.540229856967926,
            "bigbench_strategy_qa": 0.5168195962905884,
            "lambada_openai": 0.6058606505393982,
            "hellaswag": 0.5641306638717651,
            "winograd": 0.7472527623176575,
            "winogrande": 0.5643251538276672,
            "bigbench_conlang_translation": 0.030487805604934692,
            "bigbench_language_identification": 0.2542000114917755,
            "bigbench_conceptual_combinations": 0.223300963640213,
            "bigbench_elementary_math_qa": 0.24182389676570892,
            "bigbench_dyck_languages": 0.25099998712539673,
            "agi_eval_lsat_ar": 0.239130437374115,
            "bigbench_cs_algorithms": 0.43484848737716675,
            "bigbench_logical_deduction": 0.24133333563804626,
            "bigbench_operators": 0.20000000298023224,
            "bigbench_repeat_copy_logic": 0.03125,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.006000000052154064,
            "math_qa": 0.24840764701366425,
            "logi_qa": 0.2764976918697357,
            "pubmed_qa_labeled": 0.3310000002384186,
            "squad": 0.3979186415672302,
            "agi_eval_lsat_rc": 0.19402985274791718,
            "agi_eval_lsat_lr": 0.2803921699523926,
            "coqa": 0.30151572823524475,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.6134556531906128,
            "agi_eval_sat_en": 0.2330097109079361,
            "winogender_mc_female": 0.38333332538604736,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.5054491758346558,
            "bbq": 0.4405983659354123,
            "mmlu_fewshot": 0.25335154643184254,
            "gsm8k_cot": 0.01440485194325447,
            "agi_eval_sat_math_cot": 0.013636363670229912,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.06333333253860474,
            "gpqa_main": 0.234375,
            "gpqa_diamond": 0.23232322931289673
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.24783929718145425,
        "language understanding": 0.27917803880913294,
        "reading comprehension": 0.14470753518929869,
        "safety": -0.09364290670915085,
        "symbolic problem solving": 0.08292165012970634,
        "world knowledge": 0.16093203838637551
    },
    "aggregated_centered_results": 0.1481579712907893,
    "aggregated_results": 0.3376661687184928,
    "rw_small": 0.5792763928572336,
    "rw_small_centered": 0.27587751582352044,
    "95%_CI_above": 0.4436636044607534,
    "95%_CI_above_centered": 0.25686405347386915,
    "99%_CI_above": 0.4550158394419629,
    "99%_CI_above_centered": 0.3012632756606938,
    "low_variance_datasets": 0.449232506481084,
    "low_variance_datasets_centered": 0.30757704339662834,
    "model_uuid": "e3d1fe97-82c1-49a4-87db-1401f36ab3c4",
    "_filename": "exp_data/evals/evaluation_rw_v2_cc_v3_f0.15_resiliparse_fasttext_openhermes_vs_rw_v2_bigram_100k_train_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.30757704339662834,
    "Extended": 0.1481579712907893
}