{
    "name": "/mnt/task_runtime/dcnlp/eval/heavy",
    "uuid": "308b376e-627f-4df9-9bae-6cc023798b0a",
    "model": "open_lm_1b",
    "creation_date": "2024_02_21-15_07_16",
    "eval_metrics": {
        "perplexity": 2.5996701538562776,
        "downstream_perpexity": {
            "mmlu": 2.4256527389646036,
            "hellaswag": 2.339948409773206,
            "jeopardy_all": 1.6722077390862682,
            "triviaqa_sm_sub": 2.7677235139608385,
            "gsm8k": 1.2159616886620452,
            "agi_eval_sat_math": 1.478326759555123,
            "aqua": 2.4404083169236475,
            "svamp": 3.142661836147308,
            "bigbench_qa_wikidata": 2.5566330119020786,
            "arc_easy": 2.377051307546972,
            "arc_challenge": 2.5167499624014713,
            "bigbench_misconceptions": 4.998936735876074,
            "copa": 2.36597012758255,
            "siqa": 0.8837831954385021,
            "commonsense_qa": 4.945458905995625,
            "piqa": 2.614479772025016,
            "openbook_qa": 4.258818511486053,
            "bigbench_novel_concepts": 2.531723231077194,
            "bigbench_strange_stories": 3.770082209302091,
            "bigbench_strategy_qa": 2.1509931565580955,
            "lambada_openai": 1.328848773406451,
            "winograd_wsc": 2.3323368720519237,
            "winogrande": 2.9970560199930243,
            "bigbench_conlang_translation": 1.8960470251920747,
            "bigbench_language_identification": 1.5844790131765787,
            "bigbench_conceptual_combinations": 1.276056075559079,
            "bigbench_elementary_math_qa": 5.602961072606861,
            "bigbench_dyck_languages": 4.372090922832489,
            "agi_eval_lsat_ar": 1.8072391069453695,
            "bigbench_cs_algorithms": 4.6334786299503214,
            "bigbench_logical_deduction": 0.9673524785041809,
            "bigbench_operators": 4.848845375151861,
            "bigbench_repeat_copy_logic": 1.3421018049120903,
            "simple_arithmetic_nospaces": 6.390493514060974,
            "simple_arithmetic_withspaces": 5.30465443944931,
            "math_qa": 1.803949096400267,
            "logi_qa": 1.812747441129201,
            "pubmed_qa_labeled": 7.3996360301971436,
            "squad": 2.19948295590555,
            "agi_eval_lsat_rc": 1.8088039573448806,
            "agi_eval_lsat_lr": 1.5509772532126482,
            "coqa": 1.7689418322189132,
            "bigbench_understanding_fables": 1.495853131410306,
            "boolq": 3.3790877972903237,
            "agi_eval_sat_en": 1.9974802304240107,
            "winogender_mc_female": 1.0067472289005914,
            "winogender_mc_male": 0.8252336164315541,
            "enterprise_pii_classification": 10.102633864468784,
            "bbq": 1.5853421796148648,
            "human_eval_return_complex": 1.5439004165919747,
            "human_eval_return_simple": 2.5055261947013237,
            "human_eval-0.5": 1.3799617813854683,
            "human_eval-0.25": 1.4823721138442434,
            "human_eval-0.75": 1.4460464447009853,
            "human_eval": 1.6136867629318703,
            "processed_human_eval_cpp": 1.8742666999745814,
            "processed_human_eval_js": 1.8668298597742872
        },
        "icl": {
            "mmlu_zeroshot": 0.2525812578305863,
            "hellaswag_zeroshot": 0.5764787793159485,
            "jeopardy": 0.29360346496105194,
            "triviaqa_sm_sub": 0.00033333332976326346,
            "gsm8k": 0.0,
            "agi_eval_sat_math": 0.004545454401522875,
            "aqua": 0.0,
            "bigbench_qa_wikidata": 0.5979036688804626,
            "arc_easy": 0.6510942578315735,
            "arc_challenge": 0.37457337975502014,
            "bigbench_misconceptions": 0.4611872136592865,
            "copa": 0.6899999976158142,
            "siqa": 0.48413512110710144,
            "commonsense_qa": 0.2080262154340744,
            "piqa": 0.737758457660675,
            "openbook_qa": 0.37400001287460327,
            "bigbench_novel_concepts": 0.5,
            "bigbench_strange_stories": 0.517241358757019,
            "bigbench_strategy_qa": 0.5264307260513306,
            "lambada_openai": 0.5847079157829285,
            "hellaswag": 0.577076256275177,
            "winograd": 0.7655677795410156,
            "winogrande": 0.5982636213302612,
            "bigbench_conlang_translation": 0.018292682245373726,
            "bigbench_language_identification": 0.2522999942302704,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.23928196728229523,
            "bigbench_dyck_languages": 0.23100000619888306,
            "agi_eval_lsat_ar": 0.22173912823200226,
            "bigbench_cs_algorithms": 0.4545454680919647,
            "bigbench_logical_deduction": 0.23600000143051147,
            "bigbench_operators": 0.20952381193637848,
            "bigbench_repeat_copy_logic": 0.0625,
            "simple_arithmetic_nospaces": 0.009999999776482582,
            "simple_arithmetic_withspaces": 0.012000000104308128,
            "math_qa": 0.2531009018421173,
            "logi_qa": 0.22580644488334656,
            "pubmed_qa_labeled": 0.4000000059604645,
            "squad": 0.3895931839942932,
            "agi_eval_lsat_rc": 0.2798507511615753,
            "agi_eval_lsat_lr": 0.2666666805744171,
            "coqa": 0.31116122007369995,
            "bigbench_understanding_fables": 0.29100528359413147,
            "boolq": 0.608562707901001,
            "agi_eval_sat_en": 0.276699036359787,
            "winogender_mc_female": 0.5666666626930237,
            "winogender_mc_male": 0.44999998807907104,
            "enterprise_pii_classification": 0.4886597990989685,
            "bbq": 0.43649531223557214,
            "mmlu_fewshot": 0.24504713818692325,
            "gsm8k_cot": 0.0470053069293499,
            "agi_eval_sat_math_cot": 0.027272727340459824,
            "aqua_cot": 0.016326529905200005,
            "svamp_cot": 0.0533333346247673,
            "gpqa_main": 0.2142857164144516,
            "gpqa_diamond": 0.18686868250370026
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.2149770733661138,
        "language understanding": 0.3026986294802549,
        "reading comprehension": 0.17539479777935826,
        "safety": -0.029089118946682313,
        "symbolic problem solving": 0.08337470445288643,
        "world knowledge": 0.1380148804519524
    },
    "aggregated_centered_results": 0.1520561679345643,
    "aggregated_results": 0.34026616376453567,
    "rw_small": 0.5831621239582697,
    "95%_CI_above": 0.44202232176994344,
    "99%_CI_above": 0.4524495277715766,
    "low_variance_datasets": 0.4440899694507772,
    "model_uuid": "5569cbeb-880f-4664-ba51-2bd0aec15f2d",
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_openhermes_vs_rw_v2_bigram_0.1-open_lm_1b-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124_oh_ft_heavy.json",
    "missing tasks": "[]",
    "rw_small_centered": 0.2804373872210408,
    "95%_CI_above_centered": 0.25436289251505423,
    "99%_CI_above_centered": 0.2973816020401446,
    "low_variance_datasets_centered": 0.30181386278145744,
    "Core": 0.30181386278145744,
    "Extended": 0.1520561679345643
}