#!/bin/bash
# shellcheck disable=SC2090,SC2086,SC2089,SC1091
# Default project path; may be overridden with -p / --project_path
PROJECT_PATH="$HOME/projects/repo"

# Let getopt normalise the command line; abort when it reports a failure
OPTIONS=$(getopt -o p: --long project_path: -n 'parse-options' -- "$@") || {
	echo "Error parsing options" >&2
	exit 1
}

# Replace the positional parameters with getopt's re-quoted output
eval set -- "$OPTIONS"

# Consume the recognised options; whatever follows "--" stays in "$@"
while (($# > 0)); do
	if [[ $1 == "-p" || $1 == "--project_path" ]]; then
		PROJECT_PATH=$2
		shift 2
	elif [[ $1 == "--" ]]; then
		# End-of-options marker: drop it and stop parsing
		shift
		break
	else
		# Anything else is not an option: leave it in place and stop
		break
	fi
done

echo "PROJECT_PATH=$PROJECT_PATH"
#! Work from the project folder; stop here if it cannot be entered
cd "$PROJECT_PATH" || exit
#! Pick the environment preparation script according to the host name
case "$(hostname)" in
*gpu-q*)
	echo "Assuming the script is executing in the CSD3."
	#! NOTE: the script has to be sourced with "."; running it with "sh" doesn't work
	. "$PROJECT_PATH"/scripts/install_hpc_env.sh
	;;
*)
	echo "Assuming the script is executing NOT in the CSD3."
	#! NOTE: the script has to be sourced with "."; running it with "sh" doesn't work
	. "$PROJECT_PATH"/scripts/install_env.sh
	;;
esac
#! Endpoint of the S3 object store; the raw IP is used to avoid name resolution issues
export S3_ENDPOINT_URL='http://128.232.115.19:9000'

#! Value handed to --num_clients by the conversion commands in this script
NUMBER_OF_TOTAL_CLIENTS=16

# DONE
# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "smollm" \
# 	--name cosmo \
# 	--splits train \
# 	--tokenizer "HuggingFaceTB/SmolLM-1.7B" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/shared" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

# DONE
# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "smollm" \
# 	--name "python_edu" \
# 	--splits train \
# 	--tokenizer "HuggingFaceTB/SmolLM-1.7B" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/shared" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

# DONE
# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "smollm" \
# 	--name "fineweb_edu_dedup" \
# 	--splits train \
# 	--tokenizer "HuggingFaceTB/SmolLM-1.7B" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/shared" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

# DONE
# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "fine_math" \
# 	--name fine_math_4plus \
# 	--splits train \
# 	--tokenizer "HuggingFaceTB/SmolLM-1.7B" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/shared" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

# DONE
# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "fine_math" \
# 	--name fine_math_3plus \
# 	--splits train \
# 	--tokenizer "HuggingFaceTB/SmolLM-1.7B" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/shared" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

# DONE
# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "fine_math" \
# 	--name infiwebmath_4plus \
# 	--splits train \
# 	--tokenizer "HuggingFaceTB/SmolLM-1.7B" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/shared" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# DONE
# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "fine_math" \
# 	--name infiwebmath_3plus \
# 	--splits train \
# 	--tokenizer "HuggingFaceTB/SmolLM-1.7B" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/shared" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# ------ PERSONALIZED DATASETS ------

# COSMO_TOKENIZER_PATH="~/anonymous/projects/repo/trained_tokenizers/tokenizer_cosmo_"

# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "smollm" \
# 	--name cosmo \
# 	--splits train \
# 	--tokenizer "$COSMO_TOKENIZER_PATH" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/private" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

# PYTHON_EDU_TOKENIZER_PATH=""

# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "smollm" \
# 	--name "python_edu" \
# 	--splits train \
# 	--tokenizer "$PYTHON_EDU_TOKENIZER_PATH" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/private" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

# FINEWEB_EDU_TOKENIZER_PATH="~/anonymous/projects/repo/trained_tokenizers/tokenizer_fine-web-edu-dedup_"

# #! Execute the command
# uv run python -m repo.dataset.convert_dataset_hf \
# 	--path "smollm" \
# 	--name "fineweb_edu_dedup" \
# 	--splits train \
# 	--tokenizer "$FINEWEB_EDU_TOKENIZER_PATH" \
# 	--bos_text "<|endoftext|>" \
# 	--eos_text "<|endoftext|>" \
# 	--remote_path "s3://smollm-corpus/private" \
# 	--num_clients $NUMBER_OF_TOTAL_CLIENTS

# sleep 5

#! Tokenizer location passed via --tokenizer to the conversion commands.
#! Built from $HOME because "~" is NOT expanded inside double quotes, so the
#! previous value reached the program as a literal "~/..." string.
MATH_TOKENIZER_PATH="$HOME/anonymous/projects/repo/trained_tokenizers/tokenizer_infiwebmath-4plus_"

#! Convert both maths datasets with identical settings; only --name differs
math_datasets=(fine_math_4plus infiwebmath_4plus)
for idx in "${!math_datasets[@]}"; do
	#! Pause for 5 seconds between consecutive runs (not before the first one)
	if ((idx > 0)); then
		sleep 5
	fi

	#! Execute the command
	uv run python -m repo.dataset.convert_dataset_hf \
		--path "fine_math" \
		--name "${math_datasets[idx]}" \
		--splits train \
		--tokenizer "$MATH_TOKENIZER_PATH" \
		--bos_text "<|endoftext|>" \
		--eos_text "<|endoftext|>" \
		--remote_path "s3://smollm-corpus/private" \
		--num_clients "$NUMBER_OF_TOTAL_CLIENTS"
done

#! Remove the positional arguments
set --
