#!/bin/bash
# shellcheck disable=SC2090,SC2086,SC2089,SC1091
# Project root used when -p/--project_path is not given on the command line
PROJECT_PATH="$HOME/projects/repo"

# Let getopt(1) normalise the command line; abort when it rejects the input
OPTIONS=$(getopt -o p: --long project_path: -n 'parse-options' -- "$@") || {
	echo "repo_base_independent.sh: Error parsing options" >&2
	exit 1
}

# Replace the positional parameters with getopt's re-quoted output
eval set -- "$OPTIONS"

# Consume the recognised options; the remaining words stay in "$@"
while :; do
	case "$1" in
	--)
		# End-of-options marker: drop it and stop parsing
		shift
		break
		;;
	-p | --project_path)
		# Option value is the next word
		PROJECT_PATH="$2"
		shift 2
		;;
	*)
		break
		;;
	esac
done
#! Require the positional <llm_model_config> argument before reading it.
#! Diagnostics go to stderr, like the option-parsing error above.
if [[ $# -lt 1 ]]; then
	echo "repo_base_independent.sh: Illegal number of parameters." >&2
	echo "Usage: repo_base_independent.sh <llm_model_config> (-p/--project_path <project_path>)" >&2
	exit 1
fi
#! First positional argument left after option parsing; it is later passed to
#! scripts/set_llm_config.sh
MODEL_SIZE="$1"
echo "repo_base_independent.sh: MODEL_SIZE=$MODEL_SIZE"
echo "repo_base_independent.sh: PROJECT_PATH=$PROJECT_PATH"

#! Work from the project folder; stop when it cannot be entered
cd "$PROJECT_PATH" || exit
#! Preparing environment: the host name decides which preparation script runs.
#! NOTE: The scripts must be sourced with "."; running them with "sh" doesn't work
case "$(hostname)" in
*gpu-q*)
	echo "Assuming the script is executing in the CSD3."
	. "$PROJECT_PATH"/scripts/install_hpc_env.sh -p "$PROJECT_PATH"
	;;
*)
	echo "Assuming the script is executing NOT in the CSD3."
	. "$PROJECT_PATH"/scripts/install_env.sh -p "$PROJECT_PATH"
	;;
esac
#! Source the script that is expected to set the `LLM_CONFIG` environment variable
. "$PROJECT_PATH"/scripts/set_llm_config.sh -p "$PROJECT_PATH" "$MODEL_SIZE"

#! Endpoint of the S3 object store, exported for the processes started below
# export S3_ENDPOINT_URL='http://anonymous.anonymous.:9000'
export S3_ENDPOINT_URL='http://anonymous.anonymous.:9000'

#! Timestamp used in the default run id and as the per-launch log folder name
DATETIME=$(date '+%Y%m%d_%H%M%S')

#! RUN_UUID: keep a value that is already set, otherwise export the default
[ -n "$RUN_UUID" ] || export RUN_UUID="repo-13B-$DATETIME"

#! SAVE_PATH: keep a value that is already set, otherwise export the default
#! and create that directory
[ -n "$SAVE_PATH" ] || {
	export SAVE_PATH="$PROJECT_PATH/$RUN_UUID"
	mkdir -p "$SAVE_PATH"
}

#! repo_SAVE_PATH: keep a value that is already set, otherwise export the default
[ -n "$repo_SAVE_PATH" ] || export repo_SAVE_PATH="$PROJECT_PATH/runs/$RUN_UUID"

#! Make sure the run folder and this launch's timestamped sub-folder exist
mkdir -p "$repo_SAVE_PATH" "$repo_SAVE_PATH/$DATETIME"

#! Getting visible GPUs: ask PyTorch, through `uv run`, how many CUDA devices
#! it can see
N_GPUS=$(uv run python -c 'import torch; print(torch.cuda.device_count())')
#! Anything that is not a plain non-negative integer means the probe itself
#! failed (e.g. `uv` or `torch` is unavailable); stop instead of guessing
if ! [[ $N_GPUS =~ ^[0-9]+$ ]]; then
	echo "repo_base_independent.sh: Could not determine the number of GPUs (got '$N_GPUS'). Exiting." >&2
	exit 1
fi
#! With no GPU, no ClientApp would be launched further down and the script
#! would wait on the ServerApp forever, so really exit as the message says
if [ "$N_GPUS" -eq 0 ]; then
	echo "No GPUs found. Exiting." >&2
	exit 1
fi
#! Comma-separated device indices 0,1,...,N_GPUS-1
CUDA_VISIBLE_DEVICES=$(seq -s, 0 $((N_GPUS - 1)))
echo "repo_base_independent.sh: CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"

#! Append the run UUID override to the LLM configuration string
LLM_CONFIG+=" run_uuid=$RUN_UUID"

printf '%s\n' "repo_base_independent.sh: LLM_CONFIG=$LLM_CONFIG"
printf '%s\n' "repo_base_independent.sh: EXTERNAL_CONFIGS=$EXTERNAL_CONFIGS"

#! Run Hydra resolver; stdout and stderr are shown and copied to this launch's
#! log folder.
#! NOTE: $LLM_CONFIG and $EXTERNAL_CONFIGS stay unquoted so that every
#! space-separated word becomes its own argument
RESOLVER_LOG="$repo_SAVE_PATH/$DATETIME/hydra_resolver.log"
HYDRA_FULL_ERROR=1 uv run python -m repo.hydra_resolver \
	$LLM_CONFIG $EXTERNAL_CONFIGS \
	hydra/job_logging=none hydra/hydra_logging=none 2>&1 |
	tee "$RESOLVER_LOG"

#! Set the Flower driver and fleet API addresses if they haven't been set
DRIVER_API_ADDRESS=${DRIVER_API_ADDRESS:-"[::]:54752"}
FLEET_API_ADDRESS=${FLEET_API_ADDRESS:-"[::]:54753"}

#! Start a Superlink in the background, copying its output to superlink.log.
#! NOTE: The log goes to `tee` through a process substitution, not a pipeline.
#! With `cmd | tee ... &`, `$!` is the PID of `tee` (the last pipeline stage),
#! so signalling it would leave the Superlink itself running.
# GRPC_VERBOSITY=debug
uv run flower-superlink --insecure --driver-api-address "${DRIVER_API_ADDRESS}" --fleet-api-address "${FLEET_API_ADDRESS}" \
	> >(tee "$repo_SAVE_PATH"/"$DATETIME"/superlink.log) 2>&1 &
#! PID of the `uv run flower-superlink` command itself
SUPERLINK_PID=$!

#! Pause before continuing (presumably to let the Superlink come up)
sleep 5

#! Launch ServerWithrepo as a ServerApp in the background, copying its output
#! to server.log.
#! NOTE: The log goes to `tee` through a process substitution, not a pipeline.
#! With `cmd | tee ... &`, `$!` is the PID of `tee` (the last pipeline stage),
#! so waiting on it would neither track the ServerApp nor return its status.
# GRPC_VERBOSITY=debug
uv run flower-server-app repo.server_app:app --insecure --superlink "${DRIVER_API_ADDRESS}" \
	> >(tee "$repo_SAVE_PATH"/"$DATETIME"/server.log) 2>&1 &
#! Keep the pid of the ServerApp (the `uv run flower-server-app` command itself)
SERVERAPP_PID=$!

#! Pause before continuing (presumably to let the ServerApp come up)
sleep 5

#! Split the comma-separated CUDA_VISIBLE_DEVICES into an array of device ids
#! (the variable itself is left untouched)
IFS=',' read -r -a GPU_IDS <<<"$CUDA_VISIBLE_DEVICES"
#! Start from an empty list so a CLIENTAPP_PID inherited from the environment
#! is never signalled later
CLIENTAPP_PID=""
#! Launch one instance of NodeManager (as a SuperNode - ClientApp) for each CUDA_VISIBLE_DEVICES
for CUDA_VISIBLE_DEVICE in "${GPU_IDS[@]}"; do
	#! NOTE: Adding `NCCL_BLOCKING_WAIT=1` breaks the optimizer's checkpointing. We don't know why yet.
	# NCCL_DEBUG=INFO NCCL_NVB_DISABLE=1 NCCL_NVLS_ENABLE=0 # For running on Lambda Labs faulty machine
	# GRPC_VERBOSITY=debug
	#! NOTE: The log goes to `tee` through a process substitution, not a
	#! pipeline. With `cmd | tee ... &`, `$!` is the PID of `tee`, so the
	#! recorded PID would not be the ClientApp's.
	CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICE CUDA_LAUNCH_BLOCKING=1 uv run flower-client-app repo.client_app:app --insecure --superlink "${FLEET_API_ADDRESS}" --persist-client \
		> >(tee "$repo_SAVE_PATH"/"$DATETIME"/node_manager_"$CUDA_VISIBLE_DEVICE".log) 2>&1 &
	#! Append the pid of the ClientApp (space-separated list)
	CLIENTAPP_PID="$CLIENTAPP_PID $!"
done

#! Send the default signal (SIGTERM) to every PID passed as an argument.
#! Errors are silenced because a process may already have exited by the time
#! it is signalled; calling it with no argument is a no-op.
stop_jobs() {
	local pid
	for pid in "$@"; do
		kill "$pid" 2>/dev/null
	done
	return 0
}

# Enable CTRL+C to stop all background processes, then leave the script
#! NOTE: The PID variables are expanded unquoted on purpose: CLIENTAPP_PID is a
#! space-separated list, and an empty variable must contribute no argument.
trap 'stop_jobs $CLIENTAPP_PID $SUPERLINK_PID $SERVERAPP_PID; exit 130' SIGINT
trap 'stop_jobs $CLIENTAPP_PID $SUPERLINK_PID $SERVERAPP_PID; exit 143' SIGTERM

#! Wait for the ServerApp to finish and remember how it ended
wait $SERVERAPP_PID
SERVERAPP_STATUS=$?
#! Kill the ClientApp and Superlink
stop_jobs $CLIENTAPP_PID $SUPERLINK_PID
#! Report the ServerApp's exit status as the script's own
exit "$SERVERAPP_STATUS"
