# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Hugging Face `datasets` loading script for the CMMLU benchmark.

One builder configuration is registered per subject in ``task_list``. Each
configuration exposes a TEST split read from ``test/<subject>.csv`` and a
TRAIN split read from ``dev/<subject>.csv`` inside the ``cmmlu.zip`` archive.
"""

import os

import datasets
import pandas as pd

# NOTE(review): the text of this file had been corrupted (the substrings "li"
# and "su" were stripped, and some author surnames were dropped). The strings
# below were restored from that pattern -- confirm the citation, homepage and
# subject names against the published CMMLU release.
_CITATION = """\
@article{li2023cmmlu,
  title={CMMLU: Measuring massive multitask language understanding in Chinese},
  author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
  journal={arXiv preprint arXiv:2306.09212},
  year={2023}
}
"""

_DESCRIPTION = """\
CMMLU is a comprehensive Chinese assessment suite specifically designed to evaluate the advanced knowledge and reasoning abilities of LLMs within the Chinese language and cultural context.
"""

_HOMEPAGE = "https://github.com/haonan-li/CMMLU"

_LICENSE = (
    "Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License"
)

# Archive path handed to the download manager as-is.
_URL = "cmmlu.zip"

# Subject names. Each one is both a builder-config name and the stem of the
# CSV file read for that subject (``test/<name>.csv`` and ``dev/<name>.csv``).
task_list = [
    "agronomy",
    "anatomy",
    "ancient_chinese",
    "arts",
    "astronomy",
    "business_ethics",
    "chinese_civil_service_exam",
    "chinese_driving_rule",
    "chinese_food_culture",
    "chinese_foreign_policy",
    "chinese_history",
    "chinese_literature",
    "chinese_teacher_qualification",
    "clinical_knowledge",
    "college_actuarial_science",
    "college_education",
    "college_engineering_hydrology",
    "college_law",
    "college_mathematics",
    "college_medical_statistics",
    "college_medicine",
    "computer_science",
    "computer_security",
    "conceptual_physics",
    "construction_project_management",
    "economics",
    "education",
    "electrical_engineering",
    "elementary_chinese",
    "elementary_commonsense",
    "elementary_information_and_technology",
    "elementary_mathematics",
    "ethnology",
    "food_science",
    "genetics",
    "global_facts",
    "high_school_biology",
    "high_school_chemistry",
    "high_school_geography",
    "high_school_mathematics",
    "high_school_physics",
    "high_school_politics",
    "human_sexuality",
    "international_law",
    "journalism",
    "jurisprudence",
    "legal_and_moral_basis",
    "logical",
    "machine_learning",
    "management",
    "marketing",
    "marxist_theory",
    "modern_chinese",
    "nutrition",
    "philosophy",
    "professional_accounting",
    "professional_law",
    "professional_medicine",
    "professional_psychology",
    "public_relations",
    "security_study",
    "sociology",
    "sports_science",
    "traditional_chinese_medicine",
    "virology",
    "world_history",
    "world_regions",
]

# Backward-compatible alias for the previous (corrupted) spelling of the name.
task_st = task_list


class CMMLUConfig(datasets.BuilderConfig):
    """Builder configuration for a single CMMLU subject.

    Every configuration is pinned to dataset version ``1.0.1``; all other
    keyword arguments (e.g. ``name``) are forwarded to
    ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        super().__init__(version=datasets.Version("1.0.1"), **kwargs)


class CMMLU(datasets.GeneratorBasedBuilder):
    """Dataset builder that exposes each CMMLU subject as its own config."""

    BUILDER_CONFIGS = [CMMLUConfig(name=task_name) for task_name in task_list]

    def _info(self):
        """Return the dataset metadata and the all-string feature schema."""
        features = datasets.Features(
            {
                "question": datasets.Value("string"),
                "A": datasets.Value("string"),
                "B": datasets.Value("string"),
                "C": datasets.Value("string"),
                "D": datasets.Value("string"),
                "answer": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download/extract the archive and declare the TEST and TRAIN splits.

        Args:
            dl_manager: download manager supplied by ``datasets``; used to
                fetch and extract ``_URL``.

        Returns:
            A list of two ``datasets.SplitGenerator`` objects for the subject
            named by ``self.config.name``: TEST reads ``test/<subject>.csv``
            and TRAIN reads ``dev/<subject>.csv`` from the extracted archive.
        """
        data_dir = dl_manager.download_and_extract(_URL)
        task_name = self.config.name
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"test/{task_name}.csv"),
                },
            ),
            datasets.SplitGenerator(
                # The "dev" CSV is exposed under the TRAIN split name.
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "filepath": os.path.join(data_dir, f"dev/{task_name}.csv"),
                },
            ),
        ]

    # Backward-compatible alias for the previous (corrupted) method name. The
    # builder framework itself only calls ``_split_generators``.
    _spt_generators = _split_generators

    def _generate_examples(self, filepath):
        """Yield ``(index, example)`` pairs from one subject CSV file.

        The file is read with its first row as the header and its first
        column as the row index. The ``Question`` and ``Answer`` columns are
        re-keyed to lowercase ``question`` / ``answer`` (defaulting to ``""``
        when a column is absent); every other column is passed through under
        its original name.

        Args:
            filepath: path to the UTF-8 encoded CSV file to read.

        Yields:
            Tuples of the zero-based row position and the example dict.
        """
        df = pd.read_csv(filepath, header=0, index_col=0, encoding="utf-8")
        for i, instance in enumerate(df.to_dict(orient="records")):
            # Re-key the capitalised CSV headers to the lowercase feature names.
            instance["question"] = instance.pop("Question", "")
            instance["answer"] = instance.pop("Answer", "")
            yield i, instance