{
    "name": "../eval/heavy",
    "uuid": "adc6c53d-b026-4018-a9ae-a2520a79544e",
    "model": "deepseek-ai/deepseek-llm-7b-base",
    "creation_date": "2024_06_12-20_08_48",
    "eval_metrics": {
        "icl": {
            "mmlu_zeroshot": 0.41583755586230964,
            "hellaswag_zeroshot": 0.7541326284408569,
            "jeopardy": 0.48729313611984254,
            "bigbench_qa_wikidata": 0.7305743098258972,
            "arc_easy": 0.7664141654968262,
            "arc_challenge": 0.4889078438282013,
            "mmlu_fewshot": 0.48478867741007553,
            "bigbench_misconceptions": 0.5479452013969421,
            "copa": 0.8500000238418579,
            "siqa": 0.7430911064147949,
            "commonsense_qa": 0.687960684299469,
            "piqa": 0.8079434037208557,
            "openbook_qa": 0.4519999921321869,
            "bigbench_novel_concepts": 0.6875,
            "bigbench_strange_stories": 0.7586206793785095,
            "bigbench_strategy_qa": 0.5867190957069397,
            "lambada_openai": 0.7275373339653015,
            "hellaswag": 0.7698665857315063,
            "winograd": 0.901098906993866,
            "winogrande": 0.7063930630683899,
            "bigbench_conlang_translation": 0.1036585345864296,
            "bigbench_language_identification": 0.3562999963760376,
            "bigbench_conceptual_combinations": 0.5048543810844421,
            "bigbench_elementary_math_qa": 0.27903562784194946,
            "bigbench_dyck_languages": 0.36500000953674316,
            "agi_eval_lsat_ar": 0.2347826063632965,
            "bigbench_cs_algorithms": 0.48257574439048767,
            "bigbench_logical_deduction": 0.2540000081062317,
            "bigbench_operators": 0.4571428596973419,
            "bigbench_repeat_copy_logic": 0.21875,
            "simple_arithmetic_nospaces": 0.28700000047683716,
            "simple_arithmetic_withspaces": 0.28600001335144043,
            "math_qa": 0.2571237087249756,
            "logi_qa": 0.33794161677360535,
            "pubmed_qa_labeled": 0.5389999747276306,
            "squad": 0.6157048344612122,
            "agi_eval_lsat_rc": 0.43283581733703613,
            "agi_eval_lsat_lr": 0.3803921639919281,
            "coqa": 0.4668670892715454,
            "bigbench_understanding_fables": 0.5291005373001099,
            "boolq": 0.805504560470581,
            "agi_eval_sat_en": 0.5631067752838135,
            "winogender_mc_female": 0.5333333611488342,
            "winogender_mc_male": 0.4833333194255829,
            "enterprise_pii_classification": 0.6574373841285706,
            "bbq": 0.6143901781602339,
            "gpqa_main": 0.2232142835855484,
            "gpqa_diamond": 0.24747474491596222,
            "triviaqa_sm_sub": 0.5356666445732117,
            "gsm8k_cot": 0.15542078018188477,
            "agi_eval_sat_math_cot": 0.07727272808551788,
            "aqua_cot": 0.040816325694322586,
            "svamp_cot": 0.4099999964237213
        }
    },
    "missing tasks": "[]",
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.5137568535641588,
        "language understanding": 0.505397139702144,
        "reading comprehension": 0.43189101788754525,
        "safety": 0.14424712143161078,
        "symbolic problem solving": 0.2102817736361941,
        "world knowledge": 0.33516075214447333
    },
    "aggregated_centered_results": 0.35266382454612966,
    "aggregated_results": 0.4922577547190886,
    "rw_small": 0.7285587141911188,
    "rw_small_centered": 0.546704820025037,
    "95%_CI_above": 0.5965818718656317,
    "95%_CI_above_centered": 0.47014362795483955,
    "99%_CI_above": 0.6075168321961941,
    "99%_CI_above_centered": 0.5095457167672256,
    "low_variance_datasets": 0.596943171728741,
    "low_variance_datasets_centered": 0.5071607238451479,
    "_filename": "exp_data/evals/evaluation_deepseek-ai_deepseek-llm-7b-base.json",
    "Core": 0.5071607238451479,
    "Extended": 0.35266382454612966
}