#wget 'https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt'
#
#dirname=/scratch/07946/ss95332/data/rpj_data
#
#while read line; do
#    dload_loc=${line#https://data.together.xyz/redpajama-data-1T/v1.0.0/}
#    mkdir -p $(dirname $dload_loc)
#    wget "$line" -O "$dload_loc"
#done < urls.txt
#
#
#

# Directory where the data will be stored
#dirname=/scratch/07946/ss95332/data/rpj_data

# Directory containing the text files with URLs
#url_dir=/scratch/07946/ss95332/data/RedPajama-Data-1T/urls_new
#
## Loop through each text file in the url directory
#for file in "$url_dir"/*.txt; do
#    echo "Processing $file..."
#
#    # Count the total number of lines in the file
#    total_lines=$(wc -l < "$file")
#
#    # Calculate 2% of the total lines, rounding up
#    num_lines_to_select=$(( (total_lines + 49) / 50 ))
#
#    # Select 2% of lines randomly and store them in a temporary file
#    shuf -n "$num_lines_to_select" "$file" > temp_selected_urls.txt
#
#    # Download each selected URL, mirroring its path below the data directory
#    while read line; do
#        dload_loc="${dirname}/${line#https://data.together.xyz/redpajama-data-1T/v1.0.0/}"
#        mkdir -p "$(dirname "$dload_loc")"
#        wget "$line" -O "$dload_loc"
#    done < temp_selected_urls.txt
#
#    # Clean up temporary file
#    rm temp_selected_urls.txt
#done

# Directory where the data will be stored
dirname=/scratch/07946/ss95332/data/rpj_data

# Directory containing the text files with URLs
url_dir=/scratch/07946/ss95332/data/RedPajama-Data-1T/urls_new

# Prefix stripped from each URL; what remains of the URL is reused as the
# path of the downloaded file relative to the data directory.
rpj_url_prefix=https://data.together.xyz/redpajama-data-1T/v1.0.0/

# A literal carriage return, used to tolerate URL lists with CRLF line endings.
rpj_cr=$(printf '\r')

#######################################
# Download every URL listed in a text file (one URL per line).
# Each URL is saved under the data root at the path obtained by stripping
# rpj_url_prefix from the URL. Blank lines are skipped.
# Globals:   rpj_url_prefix (read), rpj_cr (read)
# Arguments: $1 - path of the text file containing the URLs
#            $2 - root directory under which downloads are stored
# Outputs:   error messages on stderr
# Returns:   0 if every URL was downloaded, 1 if the list was unreadable
#            or at least one URL failed
#######################################
download_url_file() {
    local list_file=$1
    local data_root=$2
    local url rel dest failures=0

    if [ ! -r "$list_file" ]; then
        printf 'error: cannot read URL list %s\n' "$list_file" >&2
        return 1
    fi

    # -r keeps backslashes literal; the "|| [ -n ... ]" part still processes
    # a final line that has no trailing newline.
    while read -r url || [ -n "$url" ]; do
        url=${url%"$rpj_cr"}
        [ -n "$url" ] || continue

        rel=${url#"$rpj_url_prefix"}
        dest=$data_root/$rel

        # ${dest%/*} is the parent directory of dest (dest always contains
        # the "/" that joins data_root and rel).
        if ! mkdir -p -- "${dest%/*}"; then
            printf 'error: cannot create directory for %s\n' "$dest" >&2
            failures=$((failures + 1))
            continue
        fi

        # Download to a ".part" file and rename only on success, so a failed
        # transfer neither leaves a truncated file at the final path nor
        # overwrites a copy that was downloaded earlier.
        if wget "$url" -O "${dest}.part" && mv -f -- "${dest}.part" "$dest"; then
            :
        else
            printf 'error: download failed: %s\n' "$url" >&2
            rm -f -- "${dest}.part"
            failures=$((failures + 1))
        fi
    done < "$list_file"

    [ "$failures" -eq 0 ]
}

# Loop through each text file in the url directory
rpj_lists=0
rpj_failed=0
for file in "$url_dir"/*.txt; do
    # When nothing matches, the glob stays as the literal pattern; skip it
    # instead of trying to read a file that does not exist.
    [ -e "$file" ] || continue
    rpj_lists=$((rpj_lists + 1))

    echo "Processing $file..."

    # Download each line (URL) from the file
    download_url_file "$file" "$dirname" || rpj_failed=1
done

if [ "$rpj_lists" -eq 0 ]; then
    printf 'warning: no URL lists (*.txt) found in %s\n' "$url_dir" >&2
fi

# Report failure through the exit status so callers can detect an
# incomplete download run.
if [ "$rpj_failed" -ne 0 ]; then
    printf 'error: one or more downloads failed\n' >&2
    exit 1
fi
