{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "15ba3258-d88f-47ee-a2cf-0b27e6652fb3",
    "model": "open_lm_1b",
    "creation_date": "2024_01_29-13_58_49",
    "eval_metrics": {
        "perplexity": 2.5960307319959006,
        "downstream_perpexity": {
            "mmlu": 1.8935597904420207,
            "hellaswag": 2.3686570432931155,
            "jeopardy_all": 2.149915898809131,
            "triviaqa_sm_sub": 2.4899230924050015,
            "gsm8k": 2.0084635281400125,
            "agi_eval_sat_math": 1.774504196101969,
            "aqua": 2.5212548124546905,
            "svamp": 2.6737673624356586,
            "bigbench_qa_wikidata": 3.717481727446635,
            "arc_easy": 2.647179996325111,
            "arc_challenge": 2.7229657576200093,
            "bigbench_misconceptions": 5.429594159670616,
            "copa": 2.54458992600441,
            "siqa": 1.5837384440393751,
            "commonsense_qa": 1.912755992742565,
            "piqa": 2.6700075824579814,
            "openbook_qa": 4.281978795528412,
            "bigbench_novel_concepts": 2.743645094335079,
            "bigbench_strange_stories": 3.2964326670800133,
            "bigbench_strategy_qa": 2.0078976134016506,
            "lambada_openai": 1.50437077595358,
            "winograd_wsc": 2.5422871658653565,
            "winogrande": 3.1108053946175156,
            "bigbench_conlang_translation": 2.1802063787855754,
            "bigbench_language_identification": 4.360496270391888,
            "bigbench_conceptual_combinations": 0.9976625066359066,
            "bigbench_elementary_math_qa": 3.91103002919466,
            "bigbench_dyck_languages": 5.719336134910583,
            "agi_eval_lsat_ar": 1.7558610216430997,
            "bigbench_cs_algorithms": 4.399707958192536,
            "bigbench_logical_deduction": 1.0393388773997625,
            "bigbench_operators": 5.7784984543209985,
            "bigbench_repeat_copy_logic": 1.5325254686176777,
            "simple_arithmetic_nospaces": 6.5921566123962405,
            "simple_arithmetic_withspaces": 6.340947534561157,
            "math_qa": 4.366309550797184,
            "logi_qa": 2.0118081990841166,
            "pubmed_qa_labeled": 3.283696923971176,
            "squad": 1.8833584123099876,
            "agi_eval_lsat_rc": 1.6920993132377737,
            "agi_eval_lsat_lr": 1.7508843716453104,
            "coqa": 2.3863822806304102,
            "bigbench_understanding_fables": 2.8959691574964572,
            "boolq": 3.087621776919117,
            "agi_eval_sat_en": 1.7602745479750401,
            "winogender_mc_female": 1.2795622795820236,
            "winogender_mc_male": 1.0326732099056244,
            "enterprise_pii_classification": 3.9523113372926333,
            "bbq": 0.2887903231678294,
            "human_eval_return_complex": 3.0168024123184325,
            "human_eval_return_simple": 6.31660168879741,
            "human_eval-0.5": 2.9159328079805142,
            "human_eval-0.25": 3.058158098197565,
            "human_eval-0.75": 3.0165334540169413,
            "human_eval": 3.1992829674627723,
            "processed_human_eval_cpp": 3.0742551258632114,
            "processed_human_eval_js": 2.9738942690011934
        },
        "icl": {
            "mmlu_zeroshot": 0.24113859680660985,
            "hellaswag_zeroshot": 0.5622385740280151,
            "jeopardy": 0.13641764149069785,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "svamp": 0.0,
            "bigbench_qa_wikidata": 0.5981988906860352,
            "arc_easy": 0.6132155060768127,
            "arc_challenge": 0.2994880676269531,
            "bigbench_misconceptions": 0.47031962871551514,
            "copa": 0.6800000071525574,
            "siqa": 0.48157626390457153,
            "commonsense_qa": 0.23505324125289917,
            "piqa": 0.7301414608955383,
            "openbook_qa": 0.3440000116825104,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.517241358757019,
            "bigbench_strategy_qa": 0.5120139718055725,
            "lambada_openai": 0.5427905917167664,
            "hellaswag": 0.5613423585891724,
            "winograd": 0.7509157657623291,
            "winogrande": 0.5840568542480469,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.24500000476837158,
            "bigbench_conceptual_combinations": 0.223300963640213,
            "bigbench_elementary_math_qa": 0.2424004226922989,
            "bigbench_dyck_languages": 0.2800000011920929,
            "agi_eval_lsat_ar": 0.19130434095859528,
            "bigbench_cs_algorithms": 0.39848485589027405,
            "bigbench_logical_deduction": 0.2840000092983246,
            "bigbench_operators": 0.16190476715564728,
            "bigbench_repeat_copy_logic": 0.0,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.0010000000474974513,
            "math_qa": 0.2541066110134125,
            "logi_qa": 0.23655913770198822,
            "pubmed_qa_labeled": 0.4790000021457672,
            "squad": 0.3439924418926239,
            "agi_eval_lsat_rc": 0.2723880708217621,
            "agi_eval_lsat_lr": 0.24901960790157318,
            "coqa": 0.27583613991737366,
            "bigbench_understanding_fables": 0.2380952388048172,
            "boolq": 0.5611620545387268,
            "agi_eval_sat_en": 0.3009708821773529,
            "winogender_mc_female": 0.4833333194255829,
            "winogender_mc_male": 0.5166666507720947,
            "enterprise_pii_classification": 0.46745213866233826,
            "bbq": 0.4807875860821117,
            "mmlu_fewshot": 0.2481747476155298
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.205712064630587,
        "language understanding": 0.27453599865824946,
        "reading comprehension": 0.151416463565016,
        "safety": -0.025880152528936218,
        "symbolic problem solving": 0.09184949580277867,
        "world knowledge": 0.15145146005484172
    },
    "aggregated_centered_results": 0.15259046253158243,
    "aggregated_results": 0.3577386128061517,
    "rw_small": 0.5500268439451853,
    "95%_CI_above": 0.419630620456845,
    "99%_CI_above": 0.425956335372251,
    "model_uuid": "44c405a2-7773-4922-86bc-ce9795761782",
    "low_variance_datasets": 0.41343379897827454,
    "_filename": "exp_data/evals/evaluation_rw_pagerank_bucket_3_of_5-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124_heavy.json",
    "missing tasks": "['gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'gpqa_main', 'gpqa_diamond']",
    "rw_small_centered": 0.22331109242132538,
    "95%_CI_above_centered": 0.22521818174772631,
    "99%_CI_above_centered": 0.2620397429876342,
    "low_variance_datasets_centered": 0.26283757982659295,
    "Core": 0.26283757982659295,
    "Extended": "N/A due to missing tasks: ['gsm8k_cot', 'agi_eval_sat_math_cot', 'aqua_cot', 'svamp_cot', 'gpqa_main', 'gpqa_diamond']"
}