{
    "name": "/mnt/task_runtime/bolt_exp_launcher/dcnlp/eval/heavy",
    "uuid": "833b333f-4cc3-4886-8b72-9181c5713030",
    "model": "open_lm_7b_swiglutorch",
    "creation_date": "2024_04_15-21_14_43",
    "eval_metrics": {
        "icl": {
            "mmlu_zeroshot": 0.307149011314961,
            "hellaswag_zeroshot": 0.7499502301216125,
            "jeopardy": 0.478182327747345,
            "triviaqa_sm_sub": 0.4933333396911621,
            "gsm8k": 0.03487490490078926,
            "agi_eval_sat_math": 0.027272727340459824,
            "aqua": 0.004081632476300001,
            "bigbench_qa_wikidata": 0.7080852389335632,
            "arc_easy": 0.7706229090690613,
            "arc_challenge": 0.494027316570282,
            "mmlu_fewshot": 0.4016447234572026,
            "bigbench_misconceptions": 0.621004581451416,
            "copa": 0.8500000238418579,
            "siqa": 0.6760491132736206,
            "commonsense_qa": 0.6027845740318298,
            "piqa": 0.8101196885108948,
            "openbook_qa": 0.4440000057220459,
            "bigbench_novel_concepts": 0.53125,
            "bigbench_strange_stories": 0.7471264600753784,
            "bigbench_strategy_qa": 0.5928353071212769,
            "lambada_openai": 0.7203570604324341,
            "hellaswag": 0.7625970840454102,
            "winograd": 0.860805869102478,
            "winogrande": 0.6890292167663574,
            "bigbench_conlang_translation": 0.060975611209869385,
            "bigbench_language_identification": 0.29179999232292175,
            "bigbench_conceptual_combinations": 0.43689319491386414,
            "bigbench_elementary_math_qa": 0.26273584365844727,
            "bigbench_dyck_languages": 0.1420000046491623,
            "agi_eval_lsat_ar": 0.24347825348377228,
            "bigbench_cs_algorithms": 0.4545454680919647,
            "bigbench_logical_deduction": 0.24400000274181366,
            "bigbench_operators": 0.27142858505249023,
            "bigbench_repeat_copy_logic": 0.125,
            "simple_arithmetic_nospaces": 0.03500000014901161,
            "simple_arithmetic_withspaces": 0.041999999433755875,
            "math_qa": 0.2544418275356293,
            "logi_qa": 0.29800307750701904,
            "pubmed_qa_labeled": 0.44200000166893005,
            "squad": 0.547965943813324,
            "agi_eval_lsat_rc": 0.33955225348472595,
            "agi_eval_lsat_lr": 0.29019609093666077,
            "coqa": 0.43116623163223267,
            "bigbench_understanding_fables": 0.380952388048172,
            "boolq": 0.7565749287605286,
            "agi_eval_sat_en": 0.28155338764190674,
            "winogender_mc_female": 0.46666666865348816,
            "winogender_mc_male": 0.5833333134651184,
            "enterprise_pii_classification": 0.5272459387779236,
            "bbq": 0.5012592483650554,
            "gpqa_main": 0.2299107164144516,
            "gpqa_diamond": 0.2373737394809723,
            "gsm8k_cot": 0.050796058028936386,
            "agi_eval_sat_math_cot": 0.04545454680919647,
            "aqua_cot": 0.008163264952600002,
            "svamp_cot": 0.17666666209697723
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.4613361703236585,
        "language understanding": 0.46264486715640174,
        "reading comprehension": 0.2919983729360658,
        "safety": 0.039252584630792786,
        "symbolic problem solving": 0.10968232524568496,
        "world knowledge": 0.31759146243508096
    },
    "aggregated_centered_results": 0.27747160612996724,
    "aggregated_results": 0.42962428915209644,
    "rw_small": 0.7183674375216166,
    "rw_small_centered": 0.5206000215825979,
    "95%_CI_above": 0.5443645617404541,
    "95%_CI_above_centered": 0.40108696033530317,
    "99%_CI_above": 0.5600855174271956,
    "99%_CI_above_centered": 0.44959269442249583,
    "low_variance_datasets": 0.5547509523955259,
    "low_variance_datasets_centered": 0.45750191701163706,
    "model_uuid": "6ed4a1fa-202c-4b2b-9e6e-fc718b6a8bf6",
    "_filename": "exp_data/evals/evaluation_rw_v2_cc_v3_f0.15_resiliparse_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_7b_swiglutorch-warm=5000-lr=0p0003-wd=0p33-cd=3e-05-bs=2048-mult=2-seed=124-tokens=275576422400_heavy.json",
    "missing tasks": "[]",
    "Core": 0.45750191701163706,
    "Extended": 0.27747160612996724
}