{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "be40f190-83a9-4996-8443-5958ffdf28c5",
    "model": "d=1024_l=24_h=8",
    "creation_date": "2024_06_10-15_44_55",
    "eval_metrics": {
        "perplexity": 3.4892393787701925,
        "downstream_perpexity": {
            "mmlu": 2.3402317540544746,
            "hellaswag": 2.7079312433120477,
            "jeopardy_all": 2.4092290044450286,
            "triviaqa_sm_sub": 3.614148652076721,
            "gsm8k": 2.42890662682427,
            "agi_eval_sat_math": 2.8074744820594786,
            "aqua": 2.6453727021509286,
            "svamp": 2.9261979977289836,
            "bigbench_qa_wikidata": 4.88742874304696,
            "arc_easy": 2.735951861548504,
            "arc_challenge": 2.8961546437833903,
            "bigbench_misconceptions": 5.649274301311197,
            "copa": 2.939646129608154,
            "siqa": 2.3490034023961366,
            "commonsense_qa": 3.158675379288382,
            "piqa": 3.021727085372958,
            "openbook_qa": 4.455389550209046,
            "bigbench_novel_concepts": 3.068019077181816,
            "bigbench_strange_stories": 3.901618475201486,
            "bigbench_strategy_qa": 2.3165413280302074,
            "lambada_openai": 2.508547186610041,
            "winograd_wsc": 2.9224428912222167,
            "winogrande": 3.3972020034443795,
            "bigbench_conlang_translation": 2.4269419981212152,
            "bigbench_language_identification": 3.439557437086897,
            "bigbench_conceptual_combinations": 0.9180972842336859,
            "bigbench_elementary_math_qa": 4.322661999923878,
            "bigbench_dyck_languages": 6.2542695684432985,
            "agi_eval_lsat_ar": 1.9252178876296333,
            "bigbench_cs_algorithms": 6.2784129702683655,
            "bigbench_logical_deduction": 1.0495030900637308,
            "bigbench_operators": 5.676100746790568,
            "bigbench_repeat_copy_logic": 1.872254654765129,
            "simple_arithmetic_nospaces": 7.114774998188019,
            "simple_arithmetic_withspaces": 6.83606113910675,
            "math_qa": 3.664222873859744,
            "logi_qa": 2.020258598063948,
            "pubmed_qa_labeled": 8.667948567390441,
            "squad": 2.3971902094027175,
            "agi_eval_lsat_rc": 1.7348935199317648,
            "agi_eval_lsat_lr": 1.7206466539233338,
            "coqa": 2.474125535890122,
            "bigbench_understanding_fables": 3.11686221253935,
            "boolq": 4.589456104648952,
            "agi_eval_sat_en": 1.734597938732036,
            "winogender_mc_female": 1.472806986172994,
            "winogender_mc_male": 1.8929748912652333,
            "enterprise_pii_classification": 4.837675759227006,
            "bbq": 0.44604647572331346,
            "human_eval_return_complex": 3.3814621309595783,
            "human_eval_return_simple": 5.93392345067617,
            "human_eval-0.5": 3.37878368540508,
            "human_eval-0.25": 3.4721876571818098,
            "human_eval-0.75": 3.412041552183105,
            "human_eval": 3.554723940244535,
            "processed_human_eval_cpp": 3.931700431041836,
            "processed_human_eval_js": 3.503986053350495
        },
        "icl": {
            "mmlu_zeroshot": 0.2332045443748173,
            "hellaswag_zeroshot": 0.38289186358451843,
            "jeopardy": 0.06960848215967416,
            "triviaqa_sm_sub": 0.08266666531562805,
            "gsm8k_cot": 0.007581501267850399,
            "agi_eval_sat_math_cot": 0.004545454401522875,
            "aqua_cot": 0.008163264952600002,
            "svamp_cot": 0.0533333346247673,
            "bigbench_qa_wikidata": 0.4672014117240906,
            "arc_easy": 0.5829124450683594,
            "arc_challenge": 0.3139931857585907,
            "mmlu_fewshot": 0.2550113109642999,
            "bigbench_misconceptions": 0.4840182662010193,
            "copa": 0.699999988079071,
            "siqa": 0.5153531432151794,
            "commonsense_qa": 0.33251434564590454,
            "piqa": 0.6659412384033203,
            "openbook_qa": 0.3319999873638153,
            "bigbench_novel_concepts": 0.375,
            "bigbench_strange_stories": 0.517241358757019,
            "bigbench_strategy_qa": 0.49279162287712097,
            "lambada_openai": 0.33611488342285156,
            "hellaswag": 0.3869746923446655,
            "winograd": 0.5750916004180908,
            "winogrande": 0.5003946423530579,
            "bigbench_conlang_translation": 0.012195121496915817,
            "bigbench_language_identification": 0.25679999589920044,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.2417190819978714,
            "bigbench_dyck_languages": 0.08500000089406967,
            "agi_eval_lsat_ar": 0.269565224647522,
            "bigbench_cs_algorithms": 0.4265151619911194,
            "bigbench_logical_deduction": 0.24533332884311676,
            "bigbench_operators": 0.12380952388048172,
            "bigbench_repeat_copy_logic": 0.03125,
            "simple_arithmetic_nospaces": 0.0,
            "simple_arithmetic_withspaces": 0.0020000000949949026,
            "math_qa": 0.24404960870742798,
            "logi_qa": 0.2549923062324524,
            "pubmed_qa_labeled": 0.5519999861717224,
            "squad": 0.1682119220495224,
            "agi_eval_lsat_rc": 0.2238806039094925,
            "agi_eval_lsat_lr": 0.2705882489681244,
            "coqa": 0.1386696696281433,
            "bigbench_understanding_fables": 0.27513226866722107,
            "boolq": 0.5675840973854065,
            "agi_eval_sat_en": 0.2572815418243408,
            "winogender_mc_female": 0.46666666865348816,
            "winogender_mc_male": 0.5,
            "enterprise_pii_classification": 0.4671575725078583,
            "bbq": 0.47864493185823614,
            "gpqa_main": 0.2388392835855484,
            "gpqa_diamond": 0.20202019810676575
        }
    },
    "missing tasks": "[]",
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.18988346038614284,
        "language understanding": 0.135446747983688,
        "reading comprehension": 0.11783188954812654,
        "safety": -0.04376541349020871,
        "symbolic problem solving": 0.06483158751595,
        "world knowledge": 0.10221543820792733
    },
    "aggregated_centered_results": 0.1012238363056858,
    "aggregated_results": 0.30105677299850536,
    "rw_small": 0.507962703704834,
    "rw_small_centered": 0.16154155570861195,
    "95%_CI_above": 0.36994229193184186,
    "95%_CI_above_centered": 0.16113259677464836,
    "99%_CI_above": 0.3684957358905155,
    "99%_CI_above_centered": 0.18819899570912943,
    "low_variance_datasets": 0.35059292557733984,
    "low_variance_datasets_centered": 0.18293909115614496,
    "model_uuid": "c2d72cba-4891-4e79-b8c9-a7e9bf02f78c",
    "_filename": "exp_data/evals/evaluation_fineweb_edu_sample_350BT-d=1024_l=24_h=8-warm=2000-lr=0p003-wd=0p033-cd=3e-05-bs=512-mult=1p0-seed=124-tokens=8232325120_heavy.json",
    "Core": 0.18293909115614496,
    "Extended": 0.1012238363056858
}