[
  {
    "name": "MMLU",
    "path": "data/evaluation/mmlu.csv",
    "size": "~25MB",
    "description": "57 academic subjects for testing knowledge",
    "samples": "~15,000 questions",
    "relevance": "Core benchmark for measuring knowledge retention across training iterations"
  },
  {
    "name": "HellaSwag",
    "path": "data/evaluation/hellaswag.csv",
    "size": "~35MB",
    "description": "Commonsense reasoning completion tasks",
    "samples": "~70,000 examples",
    "relevance": "Tests if model maintains reasoning ability through iterations"
  },
  {
    "name": "ARC",
    "path": "data/evaluation/arc.csv",
    "size": "~5MB",
    "description": "Science exam questions requiring reasoning",
    "samples": "~7,787 questions",
    "relevance": "Measures reasoning degradation in scientific domains"
  },
  {
    "name": "GSM8K",
    "path": "data/reasoning/gsm8k.csv",
    "size": "~8MB",
    "description": "Grade school math word problems",
    "samples": "~8,500 problems",
    "relevance": "Critical for measuring mathematical reasoning preservation"
  },
  {
    "name": "HumanEval",
    "path": "data/coding/humaneval.csv",
    "size": "~1MB",
    "description": "Python programming problems",
    "samples": "164 problems",
    "relevance": "Tests code generation quality through iterations"
  },
  {
    "name": "MBPP",
    "path": "data/coding/mbpp.csv",
    "size": "~3MB",
    "description": "Basic Python programming problems",
    "samples": "~1,000 problems",
    "relevance": "Complementary coding benchmark"
  },
  {
    "name": "TruthfulQA",
    "path": "data/knowledge/truthfulqa.csv",
    "size": "~2MB",
    "description": "Questions testing truthfulness vs falsehoods",
    "samples": "~817 questions",
    "relevance": "Critical for measuring factual accuracy preservation"
  },
  {
    "name": "WinoGrande",
    "path": "data/knowledge/winogrande.csv",
    "size": "~10MB",
    "description": "Commonsense reasoning with pronouns",
    "samples": "~44,000 examples",
    "relevance": "Tests commonsense reasoning stability"
  },
  {
    "name": "SuperGLUE-BOOLQ",
    "path": "data/raw/boolq.zip",
    "size": "~1-5MB",
    "description": "SuperGLUE boolq task",
    "samples": "Variable",
    "relevance": "Language understanding benchmark"
  },
  {
    "name": "SuperGLUE-COPA",
    "path": "data/raw/copa.zip",
    "size": "~1-5MB",
    "description": "SuperGLUE copa task",
    "samples": "Variable",
    "relevance": "Language understanding benchmark"
  },
  {
    "name": "SuperGLUE-RTE",
    "path": "data/raw/rte.zip",
    "size": "~1-5MB",
    "description": "SuperGLUE rte task",
    "samples": "Variable",
    "relevance": "Language understanding benchmark"
  },
  {
    "name": "toxigen/toxigen-data",
    "path": "data/safety/toxigen.csv",
    "samples": 940,
    "columns": 13,
    "size_mb": 0.3330802917480469
  },
  {
    "name": "tau/commonsense_qa",
    "path": "data/common_sense/commonsense_qa.csv",
    "samples": 1140,
    "columns": 5,
    "size_mb": 0.2511606216430664
  },
  {
    "name": "race",
    "path": "data/evaluation/race.csv",
    "samples": 3000,
    "columns": 5,
    "size_mb": 5.7051286697387695
  },
  {
    "name": "facebook/xnli",
    "path": "data/multilingual/xnli_en.csv",
    "samples": 2500,
    "columns": 3,
    "size_mb": 0.3899078369140625
  },
  {
    "name": "squad",
    "path": "data/evaluation/squad.csv",
    "samples": 2000,
    "columns": 5,
    "size_mb": 1.9394340515136719
  },
  {
    "name": "squad_v2",
    "path": "data/evaluation/squad_v2.csv",
    "samples": 2000,
    "columns": 5,
    "size_mb": 1.8907356262207031
  }
]