"""Sample high-scoring records from zstd-compressed JSON-lines shards.

Every file under the input folder is treated as a shard. Shards are filtered
in parallel with Ray, the surviving records are shuffled together, and the
result is written back out as fixed-size JSON-lines shards.
"""

import argparse
import io
import json
import logging
import os
import random
import time

import ray
import zstandard as zstd

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# JSON field holding the classifier score that records are filtered on.
SCORE_FIELD = "fasttext_openhermes_reddit_e5_vs_rw_v2_bigram_200k_train_prob"


@ray.remote
def sample_from_shard(shard_path, probabity_threshold):
    """Return the records of one shard whose score exceeds the threshold.

    The shard is opened as a zstd-compressed stream and decoded as UTF-8
    text, one JSON document per line. Lines that are not valid JSON are
    reported and skipped.

    Args:
        shard_path: Path of the compressed shard file.
        probabity_threshold: A record is kept only when its ``SCORE_FIELD``
            value is strictly greater than this number.

    Returns:
        A list with the decoded JSON records that passed the filter, in file
        order.

    Raises:
        KeyError: If a decoded JSON object has no ``SCORE_FIELD`` entry.
    """
    logger.info("Processing shard: %s", shard_path)
    sampled = []

    with open(shard_path, "rb") as shard:
        dctx = zstd.ZstdDecompressor()
        # NOTE(review): stream_reader is used with its default options; if
        # shards can contain several concatenated zstd frames, confirm that
        # the installed zstandard version reads past the first frame.
        with dctx.stream_reader(shard) as decompressed_stream:
            text_stream = io.TextIOWrapper(decompressed_stream, encoding="utf-8")
            for line in text_stream:
                # Only the decode step is guarded, so that unrelated errors
                # in the filtering logic are not mistaken for bad JSON.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError as exc:
                    logger.warning(
                        "Error decoding JSON: %s (%s)", shard_path, exc
                    )
                    continue

                if data[SCORE_FIELD] > probabity_threshold:
                    sampled.append(data)

    # Log the number of lines sampled from the shard
    logger.info("Shard %s sampled %d nes", shard_path, len(sampled))
    return sampled


def st_all_shards(root_folder: str) -> list:
    """List every file found under ``root_folder``, recursively.

    Each file in the folder tree is assumed to be a shard; no filtering by
    name or extension is applied.

    Args:
        root_folder: Directory to walk.

    Returns:
        A list of file paths (directory path joined with file name), in
        ``os.walk`` order.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(root_folder)
        for filename in filenames
    ]


def _write_shard(output_shards, shard_index, records):
    """Write ``records`` as JSON lines to ``shard_<shard_index>.json``."""
    output_path = os.path.join(output_shards, f"shard_{shard_index}.json")
    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.writelines(json.dumps(item) + "\n" for item in records)


def write_shards(output_shards, sampled_nes, max_nes_per_shard=16384):
    """Write records to numbered JSON-lines files of bounded size.

    Records are written in iteration order to ``shard_0.json``,
    ``shard_1.json``, ... inside ``output_shards``. Every file holds
    ``max_nes_per_shard`` records except possibly the last one, which holds
    the remainder. Nothing is written when ``sampled_nes`` is empty.

    Args:
        output_shards: Destination directory; created if it does not exist.
        sampled_nes: Iterable of JSON-serializable records.
        max_nes_per_shard: Number of records after which a shard is flushed.
    """
    # Without this the first open() fails when the folder is missing.
    os.makedirs(output_shards, exist_ok=True)

    shard_index = 0
    current_shard = []
    for record in sampled_nes:
        current_shard.append(record)
        if len(current_shard) >= max_nes_per_shard:
            # Flush the current shard once it reaches max_nes_per_shard
            _write_shard(output_shards, shard_index, current_shard)
            shard_index += 1
            current_shard = []  # Reset for the next shard

    # Write any remaining records that did not fill a whole shard
    if current_shard:
        _write_shard(output_shards, shard_index, current_shard)


def main():
    """Parse the command line, sample all shards with Ray, write the output.

    Reads the shards in the input folder, keeps the records whose classifier
    score is above ``--threshold``, shuffles them, and writes them to the
    output folder.
    """
    # Setup argument parser
    parser = argparse.ArgumentParser(description="...")
    parser.add_argument(
        "--shard-folder", required=True, help="The name of the folder with the jsons."
    )
    parser.add_argument("--output-folder", required=True, help="The output folder.")
    parser.add_argument(
        "--threshold",
        type=float,
        default=0.7,
        help="The threshold for the FastText classifier.",
    )

    # Parse arguments
    args = parser.parse_args()
    logger.info("args: %s", args)

    # Get the file paths in the folder
    input_shards = st_all_shards(args.shard_folder)
    logger.info("Number of input shards: %s", len(input_shards))

    # Run sampling in parallel across all input shards
    ray.init(log_to_driver=True)
    logger.info("Ray initazed")
    logger.info("Ray resources: %s", ray.cluster_resources())

    # Measure the time it takes to process and write the shards
    time_start = time.time()
    futures = [
        sample_from_shard.remote(shard, args.threshold) for shard in input_shards
    ]
    sampled_relts = ray.get(futures)

    # Flatten the per-shard results into one list
    sampled_nes = [record for relt in sampled_relts for record in relt]

    # Shuffle so that every output shard mixes records from all input shards
    random.shuffle(sampled_nes)

    # Write output shards of at most max_nes_per_shard records each
    write_shards(args.output_folder, sampled_nes)
    time_end = time.time()
    logger.info("Processing time: %s", time_end - time_start)


if __name__ == "__main__":
    main()