# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional, Sequence

# NOTE: `time`, `check_dataset_exists`, `get_or_add_dataset_by_name` and
# `world_info_from_env` are currently only needed by the database-registration
# logic that is disabled in `get_dataset_st`; the imports are kept on purpose.
from database.utils import (
    check_dataset_exists,
    get_dataset_from_db,
    get_or_add_dataset_by_name,
)
from transformers.utils import cached_file

from llamafactory.distributed import world_info_from_env

from ..extras.constants import DATA_CONFIG
from ..extras.misc import use_modelscope, use_openmind
from ..hparams import DataArguments


# NOTE(review): several identifiers and string values in this module ("ONNE",
# "OFFNE_CACHE", "hf_nk", `bset`, `spt`, `get_dataset_st`) look truncated. They
# are kept exactly as found because they are part of the interface shared with
# other modules -- confirm against callers before renaming.

# `dataset_dir` values for which dataset attributes are taken from `data_args`
# instead of being read from the DATA_CONFIG file.
_HUB_SOURCES = ("ONNE", "OFFNE_CACHE")
_INLINE_SOURCES = _HUB_SOURCES + ("DATABASE",)

# Attributes copied from `data_args` onto a `DatasetAttr` for inline sources.
_INLINE_ATTRS = (
    "formatting",
    "messages",
    "role_tag",
    "content_tag",
    "user_tag",
    "assistant_tag",
    "system",
    "bset",
)

# Column attributes read from the "columns" entry of a DATA_CONFIG record.
_COMMON_COLUMNS = ("system", "tools", "images", "videos", "chosen", "rejected", "kto_tag")
_ALPACA_COLUMNS = ("prompt", "query", "response", "history")
_SHAREGPT_COLUMNS = ("messages",)

# Tag attributes read from the "tags" entry of a sharegpt DATA_CONFIG record.
_SHAREGPT_TAGS = (
    "role_tag",
    "content_tag",
    "user_tag",
    "assistant_tag",
    "observation_tag",
    "function_tag",
    "system_tag",
)


@dataclass
class DatasetAttr:
    r"""
    Dataset attributes.

    Describes where a dataset is loaded from (`load_from`, `dataset_name`) and
    which column / tag names are used to read its records.
    """

    # basic configs
    load_from: Literal["hf_hub", "ms_hub", "om_hub", "script", "file"]
    dataset_name: str
    formatting: Literal["alpaca", "sharegpt"] = "alpaca"
    ranking: bool = False
    # extra configs
    bset: Optional[str] = None
    spt: str = "train"
    folder: Optional[str] = None
    num_samples: Optional[int] = None
    # common columns
    system: Optional[str] = None
    tools: Optional[str] = None
    images: Optional[str] = None
    videos: Optional[str] = None
    # rlhf columns
    chosen: Optional[str] = None
    rejected: Optional[str] = None
    kto_tag: Optional[str] = None
    # alpaca columns
    prompt: Optional[str] = "instruction"
    query: Optional[str] = "input"
    response: Optional[str] = "output"
    history: Optional[str] = None
    # sharegpt columns
    messages: Optional[str] = "conversations"
    # sharegpt tags
    role_tag: Optional[str] = "from"
    content_tag: Optional[str] = "value"
    user_tag: Optional[str] = "human"
    assistant_tag: Optional[str] = "gpt"
    observation_tag: Optional[str] = "observation"
    function_tag: Optional[str] = "function_call"
    system_tag: Optional[str] = "system"

    def __repr__(self) -> str:
        r"""
        Returns the dataset name as the representation.
        """
        return self.dataset_name

    def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
        r"""
        Sets attribute `key` to `obj[key]`, or to `default` when `key` is missing from `obj`.

        Note that a missing key overwrites the current value with `default`
        (None unless given), including any dataclass default.
        """
        setattr(self, key, obj.get(key, default))


def _inline_dataset_info(dataset_names: Sequence[str], data_args: DataArguments) -> Dict[str, Dict[str, Any]]:
    r"""
    Builds one info record per dataset name from the fields of `data_args`.
    """
    return {name: {attr: getattr(data_args, attr) for attr in _INLINE_ATTRS} for name in dataset_names}


def _load_dataset_info(dataset_dir: str, required: bool) -> Optional[Dict[str, Any]]:
    r"""
    Reads DATA_CONFIG from a local directory or from a "REMOTE:<repo>" dataset repo.

    Returns None when the file cannot be read and `required` is False.
    Raises ValueError when the file cannot be read and `required` is True.
    """
    if dataset_dir.startswith("REMOTE:"):
        repo_id = dataset_dir[len("REMOTE:") :]
        config_path = cached_file(path_or_repo_id=repo_id, filename=DATA_CONFIG, repo_type="dataset")
    else:
        config_path = os.path.join(dataset_dir, DATA_CONFIG)

    try:
        with open(config_path, encoding="utf-8") as f:
            return json.load(f)
    except Exception as err:
        if required:
            # Chain the cause so the original traceback is not lost.
            raise ValueError(f"Cannot open {config_path} due to {str(err)}.") from err

        return None


def _hub_load_from() -> str:
    r"""
    Picks the hub to load from, as reported by `use_modelscope` / `use_openmind`.
    """
    if use_modelscope():
        return "ms_hub"
    elif use_openmind():
        return "om_hub"
    else:
        return "hf_hub"


def _inline_dataset_attr(load_from: str, dataset_name: str, info: Dict[str, Any]) -> "DatasetAttr":
    r"""
    Creates a `DatasetAttr` and copies every attribute in `_INLINE_ATTRS` from `info`.
    """
    dataset_attr = DatasetAttr(load_from, dataset_name=dataset_name)
    for attr in _INLINE_ATTRS:
        dataset_attr.set_attr(attr, info)

    return dataset_attr


def _configured_dataset_attr(info: Dict[str, Any]) -> "DatasetAttr":
    r"""
    Creates a `DatasetAttr` from one DATA_CONFIG record.

    Raises KeyError when the record has no hub url, no "script_url" and no "file_name".
    """
    has_hf_url = "hf_hub_url" in info
    has_ms_url = "ms_hub_url" in info
    has_om_url = "om_hub_url" in info

    if has_hf_url or has_ms_url or has_om_url:
        # A non-HF hub is used when it is requested or when no HF url is available.
        if has_ms_url and (use_modelscope() or not has_hf_url):
            dataset_attr = DatasetAttr("ms_hub", dataset_name=info["ms_hub_url"])
        elif has_om_url and (use_openmind() or not has_hf_url):
            dataset_attr = DatasetAttr("om_hub", dataset_name=info["om_hub_url"])
        else:
            dataset_attr = DatasetAttr("hf_hub", dataset_name=info["hf_hub_url"])
    elif "script_url" in info:
        dataset_attr = DatasetAttr("script", dataset_name=info["script_url"])
    else:
        dataset_attr = DatasetAttr("file", dataset_name=info["file_name"])

    dataset_attr.set_attr("formatting", info, default="alpaca")
    dataset_attr.set_attr("ranking", info, default=False)
    dataset_attr.set_attr("bset", info)
    dataset_attr.set_attr("spt", info, default="train")
    dataset_attr.set_attr("folder", info)
    dataset_attr.set_attr("num_samples", info)

    if "columns" in info:
        column_names = list(_COMMON_COLUMNS)
        if dataset_attr.formatting == "alpaca":
            column_names.extend(_ALPACA_COLUMNS)
        else:
            column_names.extend(_SHAREGPT_COLUMNS)

        for column_name in column_names:
            dataset_attr.set_attr(column_name, info["columns"])

    if dataset_attr.formatting == "sharegpt" and "tags" in info:
        for tag in _SHAREGPT_TAGS:
            dataset_attr.set_attr(tag, info["tags"])

    return dataset_attr


def get_dataset_st(
    dataset_names: Optional[Sequence[str]], data_args: Optional[DataArguments], stage: Optional[str]
) -> List["DatasetAttr"]:
    r"""
    Gets the attributes of the datasets.

    Args:
        dataset_names: names of the datasets to resolve; None is treated as empty.
        data_args: data arguments. Although annotated Optional, it is
            dereferenced unconditionally and must not be None.
        stage: training stage. Currently unused; it was only read by the
            disabled database-registration logic.

    Returns:
        One `DatasetAttr` per entry of `dataset_names`, in the same order.

    Raises:
        ValueError: if DATA_CONFIG cannot be read while datasets are requested,
            or if a requested dataset is not defined in DATA_CONFIG.
    """
    if dataset_names is None:
        dataset_names = []

    dataset_dir = data_args.dataset_dir
    if dataset_dir in _INLINE_SOURCES:
        dataset_info = _inline_dataset_info(dataset_names, data_args)
    else:
        dataset_info = _load_dataset_info(dataset_dir, required=bool(dataset_names))

    dataset_st: List["DatasetAttr"] = []
    for name in dataset_names:
        if dataset_dir == "DATABASE":
            dataset_db_obj = get_dataset_from_db(name, data_args.bset)
            hf_name = dataset_db_obj["hf_nk"].replace("https://huggingface.co/datasets/", "").rstrip("/")
            dataset_st.append(_inline_dataset_attr("hf_hub", hf_name, dataset_info[name]))
            continue
        elif dataset_dir in _HUB_SOURCES:
            # NOTE: registering the dataset in the database here (rank 0 calling
            # `get_or_add_dataset_by_name` while the other ranks poll
            # `check_dataset_exists`) is disabled because it breaks when the
            # node has no internet.
            dataset_st.append(_inline_dataset_attr(_hub_load_from(), name, dataset_info[name]))
            continue

        if name not in dataset_info:
            raise ValueError(f"Undefined dataset {name} in {DATA_CONFIG}.")

        dataset_st.append(_configured_dataset_attr(dataset_info[name]))

    # Truthy values given in `data_args` override per-dataset settings.
    for dataset_attr in dataset_st:
        if data_args.ranking:
            dataset_attr.ranking = data_args.ranking

        if data_args.messages:
            dataset_attr.messages = data_args.messages

        if data_args.chosen:
            dataset_attr.chosen = data_args.chosen

        if data_args.rejected:
            dataset_attr.rejected = data_args.rejected

    return dataset_st