{
    "name": "/opt/ml/code/eval/heavy",
    "uuid": "963f9bb2-b427-4619-b83d-64de89fc3800",
    "model": "open_lm_1b_swiglutorch",
    "creation_date": "2024_05_18-12_59_25",
    "eval_metrics": {
        "perplexity": 2.9441497524579368,
        "downstream_perpexity": {
            "mmlu": 1.6785569769452227,
            "hellaswag": 2.3061929680963504,
            "jeopardy_all": 1.4101378736937447,
            "triviaqa_sm_sub": 2.3444793033599853,
            "gsm8k": 1.811371666089073,
            "agi_eval_sat_math": 1.5215575234456495,
            "aqua": 2.214131338255746,
            "svamp": 2.5555253505706785,
            "bigbench_qa_wikidata": 3.4978772251338834,
            "arc_easy": 2.2211195251556357,
            "arc_challenge": 2.406703659673396,
            "bigbench_misconceptions": 4.314508916036179,
            "copa": 2.4329275131225585,
            "siqa": 1.2157185026420663,
            "commonsense_qa": 1.6178572913059732,
            "piqa": 2.634515910270554,
            "openbook_qa": 4.05194163274765,
            "bigbench_novel_concepts": 2.428507387638092,
            "bigbench_strange_stories": 3.210624211135952,
            "bigbench_strategy_qa": 1.938665633224514,
            "lambada_openai": 1.375071596424371,
            "winograd_wsc": 2.4361470374432237,
            "winogrande": 3.02861856266374,
            "bigbench_conlang_translation": 1.9936509270493576,
            "bigbench_language_identification": 3.062515533645288,
            "bigbench_conceptual_combinations": 0.7803630157581811,
            "bigbench_elementary_math_qa": 4.31328062863095,
            "bigbench_dyck_languages": 4.708422603130341,
            "agi_eval_lsat_ar": 1.6771405639855759,
            "bigbench_cs_algorithms": 5.505162508559949,
            "bigbench_logical_deduction": 0.9115141662756602,
            "bigbench_operators": 4.667010362943014,
            "bigbench_repeat_copy_logic": 1.344217313453555,
            "simple_arithmetic_nospaces": 6.83174223279953,
            "simple_arithmetic_withspaces": 6.274109111785888,
            "math_qa": 3.361830391901434,
            "logi_qa": 1.9877253651802074,
            "pubmed_qa_labeled": 5.90925972032547,
            "squad": 2.201547902174005,
            "agi_eval_lsat_rc": 2.003174931255739,
            "agi_eval_lsat_lr": 1.6932042402379652,
            "coqa": 2.5713748667199225,
            "bigbench_understanding_fables": 2.8730704923155446,
            "boolq": 2.8590329252616344,
            "agi_eval_sat_en": 2.028854044317042,
            "winogender_mc_female": 1.113565997282664,
            "winogender_mc_male": 0.9476634383201599,
            "enterprise_pii_classification": 5.898499206380044,
            "bbq": 0.307618697311143,
            "human_eval_return_complex": 2.530035728544701,
            "human_eval_return_simple": 5.773793143195075,
            "human_eval-0.5": 2.4814798003289757,
            "human_eval-0.25": 2.5981241421001715,
            "human_eval-0.75": 2.5407816019000076,
            "human_eval": 2.806084378463466,
            "processed_human_eval_cpp": 2.9263730715520633,
            "processed_human_eval_js": 2.512058747977745
        },
        "icl": {
            "mmlu_zeroshot": 0.25742497621921073,
            "hellaswag_zeroshot": 0.5988847017288208,
            "jeopardy": 0.27569612562656404,
            "triviaqa_sm_sub": 0.21066667139530182,
            "gsm8k_cot": 0.008339650928974152,
            "agi_eval_sat_math_cot": 0.013636363670229912,
            "aqua_cot": 0.020408162847161293,
            "bigbench_qa_wikidata": 0.6086806654930115,
            "arc_easy": 0.6729797720909119,
            "arc_challenge": 0.36006826162338257,
            "mmlu_fewshot": 0.25219114882904187,
            "bigbench_misconceptions": 0.4840182662010193,
            "copa": 0.7099999785423279,
            "siqa": 0.4754350185394287,
            "commonsense_qa": 0.36036035418510437,
            "piqa": 0.7437431812286377,
            "openbook_qa": 0.3919999897480011,
            "bigbench_novel_concepts": 0.59375,
            "bigbench_strange_stories": 0.517241358757019,
            "bigbench_strategy_qa": 0.4945391118526459,
            "lambada_openai": 0.5678245425224304,
            "hellaswag": 0.605855405330658,
            "winograd": 0.7692307829856873,
            "winogrande": 0.5927387475967407,
            "bigbench_conlang_translation": 0.024390242993831635,
            "bigbench_language_identification": 0.2563999891281128,
            "bigbench_conceptual_combinations": 0.3009708821773529,
            "bigbench_elementary_math_qa": 0.24829664826393127,
            "bigbench_dyck_languages": 0.1679999977350235,
            "agi_eval_lsat_ar": 0.186956524848938,
            "bigbench_cs_algorithms": 0.4318181872367859,
            "bigbench_logical_deduction": 0.24799999594688416,
            "bigbench_operators": 0.190476194024086,
            "bigbench_repeat_copy_logic": 0.03125,
            "simple_arithmetic_nospaces": 0.003000000026077032,
            "simple_arithmetic_withspaces": 0.004999999888241291,
            "math_qa": 0.24874287843704224,
            "logi_qa": 0.24884793162345886,
            "pubmed_qa_labeled": 0.4359999895095825,
            "squad": 0.37625354528427124,
            "agi_eval_lsat_rc": 0.25746268033981323,
            "agi_eval_lsat_lr": 0.2235294133424759,
            "coqa": 0.30978327989578247,
            "bigbench_understanding_fables": 0.24338623881340027,
            "boolq": 0.6275229454040527,
            "agi_eval_sat_en": 0.2864077687263489,
            "winogender_mc_female": 0.550000011920929,
            "winogender_mc_male": 0.5166666507720947,
            "enterprise_pii_classification": 0.5027982592582703,
            "bbq": 0.47606911984356964,
            "gpqa_main": 0.2254464328289032,
            "gpqa_diamond": 0.24747474491596222,
            "svamp_cot": 0.05999999865889549
        }
    },
    "aggregated_task_categories_centered": {
        "commonsense reasoning": 0.25332053476104,
        "language understanding": 0.31321583535702546,
        "reading comprehension": 0.16823133083648587,
        "safety": 0.022767020897431794,
        "symbolic problem solving": 0.07119654521694807,
        "world knowledge": 0.17505271102601325
    },
    "aggregated_centered_results": 0.1658053769762357,
    "aggregated_results": 0.3493710149016307,
    "rw_small": 0.5936990678310394,
    "rw_small_centered": 0.30019749849163296,
    "95%_CI_above": 0.45118208847843205,
    "95%_CI_above_centered": 0.26960196289106947,
    "99%_CI_above": 0.45734441008256826,
    "99%_CI_above_centered": 0.3076724024979692,
    "low_variance_datasets": 0.4471146896481514,
    "low_variance_datasets_centered": 0.3098557696948399,
    "model_uuid": "8d7ddce1-9bf3-4492-bf46-1e7dc31e74f6",
    "_filename": "exp_data/evals/evaluation_rw_v2_fasttext_openhermes_reddit_eli5_vs_rw_v2_bigram_200k_train_0.1-open_lm_1b_swiglutorch-warm=5000-lr=0p003-wd=0p033-cd=3e-05-bs=256-mult=1-seed=124-tokens=28795904000_heavy.json",
    "missing tasks": "[]",
    "Core": 0.3098557696948399,
    "Extended": 0.1658053769762357
}