{
    "rw_small": ["hellaswag_zeroshot", "arc_easy", "arc_challenge", "copa", "winogrande", "boolq"],
    "95%_CI_above": [
        "hellaswag_zeroshot",
        "jeopardy",
        "gsm8k",
        "svamp",
        "bigbench_qa_wikidata",
        "arc_easy",
        "arc_challenge",
        "copa",
        "commonsense_qa",
        "piqa",
        "openbook_qa",
        "bigbench_novel_concepts",
        "bigbench_strategy_qa",
        "lambada_openai",
        "hellaswag",
        "winograd",
        "winogrande",
        "bigbench_conceptual_combinations",
        "bigbench_dyck_languages",
        "agi_eval_lsat_ar",
        "bigbench_cs_algorithms",
        "bigbench_operators",
        "bigbench_repeat_copy_logic",
        "logi_qa",
        "pubmed_qa_labeled",
        "squad",
        "coqa",
        "boolq",
        "enterprise_pii_classification",
        "bbq"
    ],
    "99%_CI_above": [
        "hellaswag_zeroshot",
        "jeopardy",
        "gsm8k",
        "svamp",
        "bigbench_qa_wikidata",
        "arc_easy",
        "arc_challenge",
        "copa",
        "commonsense_qa",
        "piqa",
        "openbook_qa",
        "lambada_openai",
        "hellaswag",
        "winograd",
        "winogrande",
        "bigbench_dyck_languages",
        "agi_eval_lsat_ar",
        "bigbench_cs_algorithms",
        "bigbench_operators",
        "bigbench_repeat_copy_logic",
        "pubmed_qa_labeled",
        "squad",
        "coqa",
        "boolq",
        "enterprise_pii_classification"
    ],
    "low_variance_datasets": [
        "hellaswag_zeroshot",
        "jeopardy",
        "bigbench_qa_wikidata",
        "arc_easy",
        "arc_challenge",
        "copa",
        "commonsense_qa",
        "piqa",
        "openbook_qa",
        "lambada_openai",
        "hellaswag",
        "winograd",
        "winogrande",
        "bigbench_dyck_languages",
        "agi_eval_lsat_ar",
        "bigbench_cs_algorithms",
        "bigbench_operators",
        "bigbench_repeat_copy_logic",
        "squad",
        "coqa",
        "boolq",
        "bigbench_language_identification"
    ]
}

