#!/bin/bash

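# NOTE: The commented-out lines below (through the commented-out echo) are an
# earlier, disabled version of this script, kept for reference only. It counted
# '*.jsonl.zst' files but selected '*.jsonl' files when copying; the active
# version further down uses '*.jsonl.zst' for both steps.
# Replace these with your actual directory paths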
#source_directory_path="/scratch/08002/gsmyrnis/open_lm_raw_data/rpj_data/common_crawl/2023-06"
#destination_directory_path="/scratch/07946/ss95332/data/rpj_data2/common_crawl/2023-06"
#
## Count the number of .jsonl files
#total_files=$(find "$source_directory_path" -type f -name '*.jsonl.zst' | wc -l)
#
## Calculate 5% of the total file count
#num_files_to_select=$((total_files * 5 / 100))
#
## Check if the calculation results in zero and adjust if necessary
#if [ $num_files_to_select -eq 0 ] && [ $total_files -gt 0 ]; then
#    num_files_to_select=1
#fi
#
## Randomly select files and copy them to the destination directory
#find "$source_directory_path" -type f -name '*.jsonl' | shuf -n $num_files_to_select | xargs -I '{}' cp '{}' "$destination_directory_path"
#
#echo "Copied $num_files_to_select files from $source_directory_path to $destination_directory_path."


# Replace these with your actual directory paths
source_directory_path="/scratch/08002/gsmyrnis/open_lm_raw_data/rpj_data/common_crawl/2023-06"
destination_directory_path="/scratch/07946/ss95332/data/rpj_data50B_2/common_crawl/2023-06"

# Make a pipeline fail when any of its stages fails, so that an error from
# shuf or cp is not hidden by the exit status of another stage.
set -o pipefail

#######################################
# Compute how many files make up a given percentage of a total.
# Arguments: $1 - total number of files
#            $2 - percentage to select (integer)
# Outputs:   the sample size on stdout. Integer division rounds down, but the
#            result is bumped to 1 whenever the total is non-zero so that a
#            small directory still yields a sample.
#######################################
sample_size() {
  local total=$1 percent=$2
  local count=$(( total * percent / 100 ))
  if (( count == 0 && total > 0 )); then
    count=1
  fi
  printf '%s\n' "$count"
}

#######################################
# Copy a random percentage of the '*.jsonl.zst' files found (recursively)
# under a source directory into a single destination directory.
# Arguments: $1 - source directory to search
#            $2 - destination directory (created if missing)
#            $3 - percentage of files to copy (optional, default 5)
# Outputs:   "Copied N files from SRC to DST." on stdout on success;
#            diagnostics on stderr on failure
# Returns:   0 on success, 1 if the source is missing, the destination cannot
#            be created, or the copy fails
# Notes:     Relies on GNU tools (shuf -z, xargs -0 -r, cp -t). Files are
#            copied flat into the destination, so files with the same name in
#            different source subdirectories would collide.
#######################################
copy_random_sample() {
  local src=$1 dst=$2 percent=${3:-5}
  local -a candidates=()
  local path count

  # Fail loudly instead of reporting "Copied 0 files" for a mistyped path.
  if [[ ! -d "$src" ]]; then
    printf 'error: source directory not found: %s\n' "$src" >&2
    return 1
  fi

  # Walk the tree once and keep the list, so that counting and selection see
  # the same set of files. NUL delimiters keep unusual filenames intact.
  while IFS= read -r -d '' path; do
    candidates+=("$path")
  done < <(find "$src" -type f -name '*.jsonl.zst' -print0)

  count=$(sample_size "${#candidates[@]}" "$percent")

  if (( count > 0 )); then
    # Without an existing directory, cp would write every selected file to a
    # single regular file named like the destination.
    if ! mkdir -p -- "$dst"; then
      printf 'error: cannot create destination directory: %s\n' "$dst" >&2
      return 1
    fi

    # xargs -0 passes names verbatim (no quote/backslash interpretation) and
    # batches many files per cp invocation.
    if ! printf '%s\0' "${candidates[@]}" \
      | shuf -z -n "$count" \
      | xargs -0 -r cp -t "$dst" --; then
      printf 'error: failed to copy files to %s\n' "$dst" >&2
      return 1
    fi
  fi

  echo "Copied $count files from $src to $dst."
}

# Randomly select 5% of the files and copy them to the destination directory.
# This is the last command, so its status becomes the script's exit status.
copy_random_sample "$source_directory_path" "$destination_directory_path" 5
