{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "ed2a871d-0cd2-4012-a0c6-388348c345c0",
    "model": "open_lm_1b_swiglutorch",
    "creation_date": "2024_06_05-12_50_29",
    "eval_metrics": {
        "perplexity": 2.2949941833813985,
        "downstream_perpexity": {
            "mmlu": 1.6026220507574904,
            "hellaswag": 2.2245406657665097,
            "jeopardy_all": 1.270883410169541,
            "triviaqa_sm_sub": 1.9406631255969404,
            "gsm8k": 1.7019234549556381,
            "agi_eval_sat_math": 1.4024091352115977,
            "aqua": 2.1025752057834546,
            "svamp": 2.407715916633606,
            "bigbench_qa_wikidata": 3.2005944775179036,
            "arc_easy": 2.1029046593290386,
            "arc_challenge": 2.2599713063891023,
            "bigbench_misconceptions": 4.030196442451651,
            "copa": 2.3481277346611025,
            "siqa": 1.4045816853322173,
            "commonsense_qa": 1.8028764628075264,
            "piqa": 2.4203548935992933,
            "openbook_qa": 3.9402265815734863,
            "bigbench_novel_concepts": 2.430714450776577,
            "bigbench_strange_stories": 2.714119974224047,
            "bigbench_strategy_qa": 1.6728946540802523,
            "lambada_openai": 1.0717786163255767,
            "winograd_wsc": 2.3856463375545682,
            "winogrande": 2.9717158300253756,
            "bigbench_conlang_translation": 1.6692790687084198,
            "bigbench_language_identification": 1.5759538004197557,
            "bigbench_conceptual_combinations": 0.9403652554576837,
            "bigbench_elementary_math_qa": 3.675449622036651,
            "bigbench_dyck_languages": 3.6211249359846116,
            "agi_eval_lsat_ar": 1.5612848577292069,
            "bigbench_cs_algorithms": 3.0837838449261405,
            "bigbench_logical_deduction": 1.0357295056184133,
            "bigbench_operators": 4.364713208448319,
            "bigbench_repeat_copy_logic": 1.2216808144003153,
            "simple_arithmetic_nospaces": 6.7022878465652465,
            "simple_arithmetic_withspaces": 5.437165542602539,
            "math_qa": 1.8163568306960318,
            "logi_qa": 1.5557935367897726,
            "pubmed_qa_labeled": 4.023958699464798,
            "squad": 1.7587959242408413,
            "agi_eval_lsat_rc": 1.5090395123211306,
            "agi_eval_lsat_lr": 1.5495911133055593,
            "coqa": 1.3004590959293896,
            "bigbench_understanding_fables": 1.60969044481005,
            "boolq": 3.6480868346100555,
            "agi_eval_sat_en": 1.552854461577332,
            "winogender_mc_female": 1.3178717235724131,
            "winogender_mc_male": 1.1155936996142068,
            "enterprise_pii_classification": 3.037311421402775,
            "bbq": 0.229366142328864,
            "human_eval_return_complex": 1.11670389090936,
            "human_eval_return_simple": 2.8090936750979036,
            "human_eval-0.5": 1.0610162099687064,
            "human_eval-0.25": 1.0791434961848143,
            "human_eval-0.75": 1.0994411983141086,
            "human_eval": 1.1393478192934177,
            "processed_human_eval_cpp": 1.4784460467581424,
            "processed_human_eval_js": 1.354879800139404
        },
        "icl": {
            "mmlu_zeroshot": 0.2639620236137457,
            "hellaswag_zeroshot": 0.6602270603179932,
            "jeopardy": 0.3669795274734497,
            "triviaqa_sm_sub": 0.30300000309944153,
            "gsm8k_cot": 0.013646702282130718,
            "agi_eval_sat_math_cot": 0.0181818176060915,
            "aqua_cot": 0.004081632476300001,
            "svamp_cot": 0.0833333358168602,
            "bigbench_qa_wikidata": 0.639387845993042,
            "arc_easy": 0.7028619647026062,
            "arc_challenge": 0.4087030589580536,
            "mmlu_fewshot": 0.25798988028576497,
            "bigbench_misconceptions": 0.456620991230011,
            "copa": 0.800000011920929,
            "siqa": 0.4918116629123688,
            "commonsense_qa": 0.30794429779052734,
            "piqa": 0.7600653171539307,
            "openbook_qa": 0.4099999964237213,
            "bigbench_novel_concepts": 0.59375,
            "bigbench_strange_stories": 0.5862069129943848,
            "bigbench_strategy_qa": 0.5526430606842041,
            "lambada_openai": 0.6675722599029541,
            "hellaswag": 0.66729736328125,
            "winograd": 0.8498168587684631,
            "winogrande": 0.6164167523384094,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.250900000333786,
            "bigbench_conceptual_combinations": 0.28155338764190674,
            "bigbench_elementary_math_qa": 0.25036686658859253,
            "bigbench_dyck_languages": 0.18400000035762787,
            "agi_eval_lsat_ar": 0.24782608449459076,
            "bigbench_cs_algorithms": 0.405303031206131,
            "bigbench_logical_deduction": 0.2553333342075348,
            "bigbench_operators": 0.21904762089252472,
            "bigbench_repeat_copy_logic": 0.125,
            "simple_arithmetic_nospaces": 0.00800000037997961,
            "simple_arithmetic_withspaces": 0.013000000268220901,
            "math_qa": 0.2567884624004364,
            "logi_qa": 0.25960060954093933,
            "pubmed_qa_labeled": 0.4909999966621399,
            "squad": 0.4771050214767456,
            "agi_eval_lsat_rc": 0.24253731966018677,
            "agi_eval_lsat_lr": 0.22549019753932953,
            "coqa": 0.3705374002456665,
            "bigbench_understanding_fables": 0.25925925374031067,
            "boolq": 0.6590213775634766,
            "agi_eval_sat_en": 0.20873786509037018,
            "winogender_mc_female": 0.38333332538604736,
            "winogender_mc_male": 0.44999998807907104,
            "enterprise_pii_classification": 0.49219441413879395,
            "bbq": 0.4404748569835316,
            "gpqa_main": 0.2388392835855484,
            "gpqa_diamond": 0.24747474491596222
        }
    },
    "missing tasks": "[]",
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.30954753143199976,
        "language understanding": 0.36822179973645935,
        "reading comprehension": 0.1931778172341486,
        "safety": -0.11699870770627802,
        "symbolic problem solving": 0.08814945570299546,
        "world knowledge": 0.20490506337748635
    },
    "aggregated_centered_results": 0.1862429212399159,
    "aggregated_results": 0.36697386838490403,
    "rw_small": 0.641205037633578,
    "rw_small_centered": 0.38298513491948444,
    "95%_CI_above": 0.48772604203262887,
    "95%_CI_above_centered": 0.31852458082675206,
    "99%_CI_above": 0.5012307505244794,
    "99%_CI_above_centered": 0.3657582384628662,
    "low_variance_datasets": 0.4907278568907218,
    "low_variance_datasets_centered": 0.36877082958329416,
    "model_uuid": "fe10ac65-32b5-48b0-8fbc-3ea1c996102e",
    "_filename": "exp_data/evals/evaluation_cc_v4_resiliparse_rw_v2_bff_minngram20_10shards_shard3_OH_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=5p0-seed=124-tokens=143979520000_heavy.json",
    "Core": 0.36877082958329416,
    "Extended": 0.1862429212399159
}