#!/bin/bash
#
# Randomly sample 5% of the lines (entries) of a JSONL file into a new file.
#
# Counts the lines of source_file, picks a random 5% of them with shuf (at
# least one line when the file is non-empty), and writes them to
# destination_file, overwriting any existing content.
#
# Exits non-zero without touching the destination if the source file is
# missing or unreadable.

# Abort on command failure, unset variables, and failed pipeline stages so a
# broken step can never be followed by a misleading success message.
set -euo pipefail

# Set the source and destination file paths
source_file="/scratch/08002/gsmyrnis/open_lm_raw_data/rpj_data/wikipedia/wiki.jsonl"
destination_file="/scratch/07946/ss95332/data/rpj_data2/wikipedia/wiki.jsonl"

# Validate the input first: the output redirection further down truncates the
# destination file even when the sampling command itself fails.
if [[ ! -f "$source_file" || ! -r "$source_file" ]]; then
    printf 'Error: source file not found or not readable: %s\n' "$source_file" >&2
    exit 1
fi

# The shell redirection cannot create missing parent directories.
mkdir -p -- "$(dirname -- "$destination_file")"

# Count the total number of lines (entries) in the file
total_lines=$(wc -l < "$source_file")

# Calculate 5% of the total line count (integer division rounds down)
sample_size=$(( total_lines * 5 / 100 ))

# Small non-empty files round down to zero; still sample one entry from them
if (( sample_size == 0 && total_lines > 0 )); then
    sample_size=1
fi

# Randomly sample the entries and write to the destination file
shuf -n "$sample_size" -- "$source_file" > "$destination_file"

echo "Sampled $sample_size entries from $source_file to $destination_file."



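# (Disabled) Alternative for the book dataset: take the first 5% of entries
# with head instead of a random sample. Uncomment the lines below to use it.
# Set the source and destination file paths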
#source_file="/scratch/08002/gsmyrnis/open_lm_raw_data/rpj_data/book/book.jsonl"
#destination_file="/scratch/07946/ss95332/data/rpj_data2/book/book.jsonl"
#
## Count the total number of lines (entries) in the file
#total_lines=$(wc -l < "$source_file")
#
## Calculate 5% of the total line count
#sample_size=$((total_lines * 5 / 100))
#
## Check if the calculation results in zero and adjust if necessary
#if [ $sample_size -eq 0 ] && [ $total_lines -gt 0 ]; then
#    sample_size=1
#fi
#
## Select the first 5% of the entries and write to the destination file
#head -n $sample_size "$source_file" > "$destination_file"
#
#echo "Selected first $sample_size entries from $source_file to $destination_file."
