{
    "name": "/home/ubuntu/research/openlm/dcnlp/eval/heavy",
    "uuid": "c45932dd-26ce-4e8f-8aab-f34f1d658479",
    "model": "open_lm_1b_swiglutorch",
    "creation_date": "2024_04_26-20_44_45",
    "eval_metrics": {
        "perplexity": 2.7481574018796286,
        "downstream_perpexity": {
            "mmlu": 1.6429768863048262,
            "hellaswag": 2.3230459476177305,
            "jeopardy_all": 1.5867244721754126,
            "triviaqa_sm_sub": 2.327077808837096,
            "gsm8k": 1.8128092864682224,
            "agi_eval_sat_math": 1.5610863149166108,
            "aqua": 2.296136424979385,
            "svamp": 2.6462817589441934,
            "bigbench_qa_wikidata": 3.556388592760069,
            "arc_easy": 2.3260767318885334,
            "arc_challenge": 2.463615289574597,
            "bigbench_misconceptions": 4.708744525909424,
            "copa": 2.4786342966556547,
            "siqa": 1.434481920947072,
            "commonsense_qa": 1.6613447805782695,
            "piqa": 2.517713701245056,
            "openbook_qa": 4.060432768344879,
            "bigbench_novel_concepts": 2.494875468313694,
            "bigbench_strange_stories": 3.0460802481092255,
            "bigbench_strategy_qa": 1.6848811414784457,
            "lambada_openai": 1.2768896224229151,
            "winograd_wsc": 2.477454281770266,
            "winogrande": 3.0465051666807734,
            "bigbench_conlang_translation": 2.0232757090068443,
            "bigbench_language_identification": 1.5856890225439078,
            "bigbench_conceptual_combinations": 1.0646116907156786,
            "bigbench_elementary_math_qa": 5.10018704829101,
            "bigbench_dyck_languages": 3.8993314712047575,
            "agi_eval_lsat_ar": 1.7063001601592354,
            "bigbench_cs_algorithms": 3.3804985788735475,
            "bigbench_logical_deduction": 1.024000820438067,
            "bigbench_operators": 5.14864182472229,
            "bigbench_repeat_copy_logic": 1.3891711458563805,
            "simple_arithmetic_nospaces": 6.769995848655701,
            "simple_arithmetic_withspaces": 5.727429147720337,
            "math_qa": 1.645394386879118,
            "logi_qa": 1.8284729794240036,
            "pubmed_qa_labeled": 3.24262331533432,
            "squad": 2.0028587534107345,
            "agi_eval_lsat_rc": 1.9788973731781119,
            "agi_eval_lsat_lr": 1.8391453850503061,
            "coqa": 1.7118223209690513,
            "bigbench_understanding_fables": 1.592772297758274,
            "boolq": 2.4301325653671126,
            "agi_eval_sat_en": 2.0373707697229477,
            "winogender_mc_female": 1.078681825598081,
            "winogender_mc_male": 0.874386074145635,
            "enterprise_pii_classification": 5.062670087884557,
            "bbq": 0.30878005930948177,
            "human_eval_return_complex": 1.2991809905983331,
            "human_eval_return_simple": 2.3685022302576013,
            "human_eval-0.5": 1.2124423391935302,
            "human_eval-0.25": 1.236578090161812,
            "human_eval-0.75": 1.236605563905181,
            "human_eval": 1.3081663654344837,
            "processed_human_eval_cpp": 1.39163024825339,
            "processed_human_eval_js": 1.3453446597587773
        },
        "icl": {
            "mmlu_zeroshot": 0.25485974837813463,
            "hellaswag_zeroshot": 0.5828520059585571,
            "jeopardy": 0.24892502427101135,
            "triviaqa_sm_sub": 0.19833333790302277,
            "gsm8k": 0.007581501267850399,
            "agi_eval_sat_math": 0.0181818176060915,
            "aqua": 0.004081632476300001,
            "bigbench_qa_wikidata": 0.5857979655265808,
            "arc_easy": 0.6523569226264954,
            "arc_challenge": 0.3387371897697449,
            "mmlu_fewshot": 0.2580224897777825,
            "bigbench_misconceptions": 0.49771690368652344,
            "copa": 0.7099999785423279,
            "siqa": 0.515864908695221,
            "commonsense_qa": 0.2325962334871292,
            "piqa": 0.7519042491912842,
            "openbook_qa": 0.3840000033378601,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.5919540524482727,
            "bigbench_strategy_qa": 0.523809552192688,
            "lambada_openai": 0.6060547232627869,
            "hellaswag": 0.5846444964408875,
            "winograd": 0.761904776096344,
            "winogrande": 0.5966851115226746,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.25529998540878296,
            "bigbench_conceptual_combinations": 0.25242719054222107,
            "bigbench_elementary_math_qa": 0.24004192650318146,
            "bigbench_dyck_languages": 0.23100000619888306,
            "agi_eval_lsat_ar": 0.2652173936367035,
            "bigbench_cs_algorithms": 0.4416666626930237,
            "bigbench_logical_deduction": 0.24666666984558105,
            "bigbench_operators": 0.1666666716337204,
            "bigbench_repeat_copy_logic": 0.03125,
            "simple_arithmetic_nospaces": 0.0020000000949949026,
            "simple_arithmetic_withspaces": 0.003000000026077032,
            "math_qa": 0.24840764701366425,
            "logi_qa": 0.2549923062324524,
            "pubmed_qa_labeled": 0.4959999918937683,
            "squad": 0.3978240191936493,
            "agi_eval_lsat_rc": 0.25,
            "agi_eval_lsat_lr": 0.2450980395078659,
            "coqa": 0.30915695428848267,
            "bigbench_understanding_fables": 0.2222222238779068,
            "boolq": 0.6073394417762756,
            "agi_eval_sat_en": 0.24271844327449799,
            "winogender_mc_female": 0.44999998807907104,
            "winogender_mc_male": 0.550000011920929,
            "enterprise_pii_classification": 0.4980854094028473,
            "bbq": 0.49446706067432056,
            "gpqa_main": 0.2008928507566452,
            "gpqa_diamond": 0.19696970283985138,
            "gsm8k_cot": 0.008339650928974152,
            "agi_eval_sat_math_cot": 0.022727273404598236,
            "aqua_cot": 0.016326529905200005,
            "svamp_cot": 0.05000000074505806
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.25298848882832303,
        "language understanding": 0.30117615866777053,
        "reading comprehension": 0.1633256617279952,
        "safety": -0.003723764961416076,
        "symbolic problem solving": 0.08018902381775757,
        "world knowledge": 0.15642753406052004
    },
    "aggregated_centered_results": 0.16022902097770964,
    "aggregated_results": 0.3450192235266315,
    "rw_small": 0.5813284416993459,
    "rw_small_centered": 0.2797746243532638,
    "95%_CI_above": 0.44665576215688285,
    "95%_CI_above_centered": 0.26279057130652833,
    "99%_CI_above": 0.4556810969891756,
    "99%_CI_above_centered": 0.3027784783219809,
    "low_variance_datasets": 0.4428127188574184,
    "low_variance_datasets_centered": 0.30238555679371065,
    "model_uuid": "2953d72c-df3c-4948-805d-acf0e95d45db",
    "_filename": "exp_data/evals/evaluation_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_minhash.b15.r93_substr-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.30238555679371065,
    "Extended": 0.16022902097770964
}