#!/bin/bash
#
# Copy a random sample of the *.jsonl.zst files found (recursively) under a
# source directory into a single destination directory.
#
# Usage: <script> [source_dir [destination_dir [percent]]]
#   source_dir       directory searched recursively (default: see below)
#   destination_dir  directory receiving the copies; created if missing
#   percent          integer 1-100, share of the files to copy (default: 5)
#
# Requires GNU find, shuf, xargs and cp (-print0, -z, -0/-r, -t).
# Exit status: 0 on success, 1 on an I/O failure, 2 on an invalid percent.

# No `set -e` here: every step that can fail is checked explicitly and the
# script's exit status is the status of the final copy_random_sample call.
set -u -o pipefail

# Replace these defaults with your actual directory paths, or pass the paths
# as the first and second command-line arguments.
source_directory_path="${1:-/scratch/08002/gsmyrnis/open_lm_raw_data/rpj_data/common_crawl/2023-06}"
destination_directory_path="${2:-/scratch/07946/ss95332/data/rpj_data2/common_crawl/2023-06}"

# Percentage of the matching files to copy.
sample_percent="${3:-5}"

#######################################
# Copy a random percentage of the *.jsonl.zst files under a directory.
# At least one file is copied whenever at least one file matches.
# NOTE: the copies are flattened into one directory, so files that share a
# basename in different source subdirectories overwrite each other.
# Arguments:
#   $1 - source directory (searched recursively)
#   $2 - destination directory (created when it does not exist)
#   $3 - percentage of files to copy, integer 1-100 (default: 5)
# Outputs:
#   Summary line on stdout; warnings and errors on stderr.
# Returns:
#   0 on success (including "nothing to copy"), 1 on I/O failure,
#   2 on an invalid percentage.
#######################################
copy_random_sample() {
    local src=$1
    local dst=$2
    local percent=${3:-5}
    local -a candidates=()
    local file total num_files_to_select

    if [[ ! -d "$src" ]]; then
        printf 'error: source directory not found: %s\n' "$src" >&2
        return 1
    fi

    if [[ ! "$percent" =~ ^[0-9]+$ ]]; then
        printf 'error: percent must be an integer from 1 to 100, got: %s\n' "$percent" >&2
        return 2
    fi
    # Force base 10 so a value such as "08" is not parsed as invalid octal.
    percent=$((10#$percent))
    if ((percent < 1 || percent > 100)); then
        printf 'error: percent must be an integer from 1 to 100, got: %s\n' "$percent" >&2
        return 2
    fi

    # Walk the tree once. NUL-delimited names keep spaces, quotes and newlines
    # in file names intact, and counting and sampling use the same file list.
    while IFS= read -r -d '' file; do
        candidates+=("$file")
    done < <(find "$src" -type f -name '*.jsonl.zst' -print0)
    total=${#candidates[@]}

    if ((total == 0)); then
        printf 'warning: no .jsonl.zst files found under %s\n' "$src" >&2
        printf 'Copied 0 files from %s to %s.\n' "$src" "$dst"
        return 0
    fi

    # Integer division rounds down; never let a non-empty set round to zero.
    num_files_to_select=$((total * percent / 100))
    if ((num_files_to_select == 0)); then
        num_files_to_select=1
    fi

    # Without an existing directory, `cp FILE DEST` would create a regular
    # file named DEST instead of copying into it.
    if ! mkdir -p -- "$dst"; then
        printf 'error: cannot create destination directory: %s\n' "$dst" >&2
        return 1
    fi

    # `cp -t` takes the target first so xargs can append many files per
    # invocation; pipefail surfaces a failure from any stage of the pipeline.
    if ! printf '%s\0' "${candidates[@]}" \
        | shuf -z -n "$num_files_to_select" \
        | xargs -0 -r cp -t "$dst" --; then
        printf 'error: copying files to %s failed\n' "$dst" >&2
        return 1
    fi

    printf 'Copied %s files from %s to %s.\n' "$num_files_to_select" "$src" "$dst"
}

copy_random_sample "$source_directory_path" "$destination_directory_path" "$sample_percent"
