import os
import tempfile
from typing import Iterator, List, Literal, Optional, Union

import pandas as pd
import ray
from datasets import Dataset, concatenate_datasets
from tqdm import tqdm

from engine.operators.operator import (
    DatasetRefs,
    ExecutionContext,
    ManyShardRefsGenerator,
    Operator,
    OperatorSpecificConfig,
)


class MergeOperatorConfig(OperatorSpecificConfig):
    """
    Configuration class for merge operators.

    Attributes:
        type (Literal["merge"]): The type of the operator, always set to "merge".
        join_column (str): Name of the column used to match rows between the
            input datasets.
        fill_value (Optional[Union[str, int, float]]): If not None, every
            value still missing after the merge is replaced with it.
        escapechar (Optional[str]): Escape character used when the
            intermediate CSV file is written and read back.
        chunk_size (Optional[int]): Number of rows read back from the
            intermediate CSV file at a time. None means a single chunk.
    """

    type: Literal["merge"] = "merge"
    join_column: str
    fill_value: Optional[Union[str, int, float]] = None
    escapechar: Optional[str] = None
    chunk_size: Optional[int] = 10000


class MergeOperator(Operator):
    """
    Operator that joins multiple datasets using a given column.
    """

    def __init__(
        self,
        id: str,
        input_ids: List[str],
        config: MergeOperatorConfig,
        execution_context: ExecutionContext,
    ):
        """
        Initialize the MergeOperator.

        Args:
            id (str): Unique identifier for the operator.
            input_ids (List[str]): List of input identifiers for the operator.
            config (MergeOperatorConfig): Specific configuration for the merge operator.
            execution_context (ExecutionContext): Execution context for the operator.
        """
        super().__init__(id, input_ids, config, execution_context)

    def compute(self, inputs: DatasetRefs) -> ManyShardRefsGenerator:
        """
        Launch the merge as a single remote task and yield its result reference.

        Args:
            inputs (DatasetRefs): Map from input name to the shard references
                of that input.

        Returns:
            ManyShardRefsGenerator: Generator yielding exactly one reference,
                the merged dataset produced by ``join_datasets``.
        """
        # Materialize every shard collection so the remote task receives
        # plain lists that it can index and iterate more than once.
        input_datasets = {key: list(shard_refs) for key, shard_refs in inputs.items()}
        yield self.join_datasets.options(name="merge").remote(
            input_datasets,
            self.config.join_column,
            self.config.fill_value,
            self.config.chunk_size,
            self.config.escapechar,
        )

    @staticmethod
    @ray.remote
    def join_datasets(
        dataset_refs: DatasetRefs,
        join_column: str,
        fill_value: Optional[Union[str, int, float]] = None,
        chunk_size=10000,
        escapechar=None,
    ) -> Dataset:
        """
        Add the columns of every additional input to the first input.

        The shards of the first input are concatenated into one pandas frame.
        Each shard of every other input is written to a temporary CSV file and
        streamed back in chunks; for each column not already owned by the
        first input or by an earlier input, cells that are still null and
        whose ``join_column`` value occurs in the chunk are filled in. Rows
        are never added, so join keys missing from the first input are
        ignored. When a key is repeated inside a chunk, its last row wins.

        Args:
            dataset_refs (DatasetRefs): Map from input name to shard
                references. The first key is the base of the merge.
            join_column (str): Column used to match rows.
            fill_value (Optional[Union[str, int, float]]): Replacement for
                values still missing after the merge. None leaves them as is.
            chunk_size (Optional[int]): Rows per CSV chunk. None or 0 reads
                each shard back as one chunk.
            escapechar (Optional[str]): Escape character for the CSV round trip.

        Returns:
            Dataset: The merged dataset.

        Raises:
            ValueError: If there is no shard to merge into the first input.
        """
        dataset_refs_keys = list(dataset_refs.keys())
        # One list of shard references per additional input; shards stay
        # grouped so that every shard of an input can contribute its columns.
        shard_groups = [list(dataset_refs[key]) for key in dataset_refs_keys[1:]]
        if not any(shard_groups):
            raise ValueError("Need at least 2 datasets to perform join")

        # An empty string is treated like "no escape character".
        escapechar = escapechar or None

        result_df = concatenate_datasets(
            ray.get(dataset_refs[dataset_refs_keys[0]])
        ).to_pandas()
        existing_columns = set(result_df.columns)

        for shard_refs in shard_groups:
            columns_from_input = set()
            for shard_ref in shard_refs:
                new_df = ray.get(shard_ref).to_pandas()
                new_columns = [
                    col
                    for col in new_df.columns
                    if col not in existing_columns and col != join_column
                ]
                if not new_columns:
                    continue
                columns_from_input.update(new_columns)

                df_subset = new_df[[join_column] + new_columns]
                row_count = len(df_subset)
                rows_per_chunk = chunk_size if chunk_size else max(row_count, 1)
                # Ceiling division, used only for the progress bar.
                total_chunks = max(1, -(-row_count // rows_per_chunk))

                with tempfile.TemporaryDirectory() as tmpdirname:
                    print("created temporary directory", tmpdirname)
                    csv_path = os.path.join(tmpdirname, "temp.csv")
                    # NOTE(review): values are re-parsed by read_csv, so their
                    # dtypes may differ from those in the shard.
                    df_subset.to_csv(csv_path, index=False, escapechar=escapechar)
                    del df_subset

                    # Add new columns to result_df if they don't exist
                    for col in new_columns:
                        if col not in result_df.columns:
                            result_df[col] = None

                    # The same escape character must be used for reading,
                    # otherwise escaped text comes back altered.
                    with pd.read_csv(
                        csv_path, chunksize=rows_per_chunk, escapechar=escapechar
                    ) as reader:
                        for chunk in tqdm(reader, total=total_chunks):
                            key_in_chunk = result_df[join_column].isin(
                                chunk[join_column]
                            )
                            for col in new_columns:
                                # Only cells that are still null may be filled.
                                fillable = result_df[col].isna() & key_in_chunk
                                if not fillable.any():
                                    continue
                                value_map = dict(zip(chunk[join_column], chunk[col]))
                                result_df.loc[fillable, col] = result_df.loc[
                                    fillable, join_column
                                ].map(value_map)

            # Register the columns only after all shards of this input were
            # processed; doing it per shard would make every later shard of
            # the same input look like it has nothing new to contribute.
            existing_columns.update(columns_from_input)

        if fill_value is not None:
            result_df = result_df.fillna(fill_value)
        return Dataset.from_pandas(result_df)