#!/bin/bash
# ------------------------------------------------------------------
# 1.  Waiting for earlier jobs, then setting the fixed parameters for the sweep
# ------------------------------------------------------------------

# Hold off until no process whose command line matches one of these names
# is left, polling once a minute. The names are checked one after another.
for busy_proc in "composer" "flower-superlink"; do
	while pgrep -f "$busy_proc" >/dev/null; do
		printf '%s is still running. Waiting...\n' "$busy_proc"
		sleep 60
	done
done

# Start from an empty shared-memory directory.
rm -rf /dev/shm/*

# Base label of the sweep.
SWEEP_NAME="IID"

# key=value overrides shared by every run of the sweep, one per entry.
fixed_overrides=(
	"repo.n_nodes=4"
	"fl.n_total_clients=4"
	"fl.n_clients_per_round=4"
	"fl.strategy_name=FEDAVG"
	"fl.strategy_kwargs.server_learning_rate=1.0"
	"llm_config.global_train_batch_size=256"
	"llm_config.optimizer.name=adopt"
	"llm_config.optimizer.lr=0.0021"
	"llm_config.optimizer.betas=[0.95,0.9999]"
	"llm_config.scheduler.schedulers.lr.t_warmup=512ba"
	"llm_config.scheduler.schedulers.lr.t_cooldown=256ba"
	"llm_config.scheduler.schedulers.lr.t_max=3072ba"
	"llm_config.device_train_microbatch_size=auto"
	"llm_config.max_duration=3072ba"
)

# Flatten into one string; the format is reused per entry, so each override
# is preceded by a single space (the string therefore starts with a space).
printf -v FIXED_PARAMETERS ' %s' "${fixed_overrides[@]}"

# Timestamp taken once, so every run of this sweep carries the same suffix
# (previously each tuple forked its own `date`, and this variable was unused).
DATETIME=$(date '+%Y%m%d_%H%M%S')

# One whitespace-separated record per run. Field order:
#   reset n_steps n_rounds parameters exp_avg exp_avg_sq
#   opt_name opt_lr opt_betas resume_round run_uuid
# Fields must not contain spaces, since records are split on whitespace.
TUPLES=(
	# Complete ADOPT parameters base 16
	"False 16 192 2 1 1 adopt 0.0021 [0.95,0.9999] -1 135M_IID_${DATETIME}"
	"False 16 192 16 1 1 adopt 0.0021 [0.95,0.9999] -1 135M_IID_${DATETIME}"
	"False 16 192 64 1 1 adopt 0.0021 [0.95,0.9999] -1 135M_IID_${DATETIME}"
	"False 16 192 192 1 1 adopt 0.0021 [0.95,0.9999] -1 135M_IID_${DATETIME}"

	# Decoupled adamw parameters base 16
	"False 16 192 2 1 1 decoupled_adamw 0.003 [0.95,0.95] -1 135M_IID_ADAM_${DATETIME}"
	"False 16 192 16 1 1 decoupled_adamw 0.003 [0.95,0.95] -1 135M_IID_ADAM_${DATETIME}"
	"False 16 192 64 1 1 decoupled_adamw 0.003 [0.95,0.95] -1 135M_IID_ADAM_${DATETIME}"
	"False 16 192 192 1 1 decoupled_adamw 0.003 [0.95,0.95] -1 135M_IID_ADAM_${DATETIME}"
)

 

# ------------------------------------------------------------------
# Run every record in TUPLES sequentially: export the per-run
# environment, invoke the sweep launcher, then clean up before the
# next run.
# ------------------------------------------------------------------

# Split one whitespace-separated record into its fields and export the
# per-run environment: RUN_UUID, SWEEP_PARAMETERS and SWEEP_NAME.
# (Presumably read by run_sweep_opt_iid_135M.sh; confirm against that script.)
# Globals:   FIXED_PARAMETERS (read); RUN_UUID, SWEEP_PARAMETERS,
#            SWEEP_NAME (written and exported)
# Arguments: $1 - record in the field order
#            reset n_steps n_rounds parameters exp_avg exp_avg_sq
#            opt_name opt_lr opt_betas resume_round run_uuid
configure_run() {
	local reset n_steps n_rounds parameters exp_avg exp_avg_sq
	local opt_name opt_lr opt_betas resume_round run_uuid
	read -r reset n_steps n_rounds parameters exp_avg exp_avg_sq \
		opt_name opt_lr opt_betas resume_round run_uuid <<<"$1"

	# Per-run overrides are appended after the fixed ones.
	local run_params="$FIXED_PARAMETERS"
	run_params+=" fl.reset_optimizer=$reset"
	run_params+=" fl.n_local_steps=$n_steps"
	run_params+=" fl.n_rounds=$n_rounds"
	run_params+=" fl.parameter_scheduler_kwargs.PARAMETERS=$parameters"
	run_params+=" +fl.parameter_scheduler_kwargs.EXP_AVG=$exp_avg"
	run_params+=" +fl.parameter_scheduler_kwargs.EXP_AVG_SQ=$exp_avg_sq"
	run_params+=" llm_config.optimizer.name=$opt_name"
	run_params+=" llm_config.optimizer.lr=$opt_lr"
	run_params+=" llm_config.optimizer.betas=$opt_betas"
	run_params+=" repo.resume_round=$resume_round"

	export RUN_UUID="${parameters}_${exp_avg}_${exp_avg_sq}_$run_uuid"
	export SWEEP_PARAMETERS="$run_params"
	# Always rebuilt from the base label so suffixes never accumulate
	# from one run to the next.
	export SWEEP_NAME="IID-${parameters}-${exp_avg}-${exp_avg_sq}"
}

# Kill leftover processes and remove shared-memory / ray* leftovers.
# Every step runs unconditionally: pkill exits non-zero when nothing
# matched, and chaining the removals behind it with && used to skip them
# whenever the previous run had already shut down cleanly.
cleanup_after_run() {
	pkill -f -9 repo/.venv
	rm -rf /dev/shm/*
	rm -rf "$HOME"/projects/repo/ray*
	pkill -9 -f python
}

for tuple in "${TUPLES[@]}"; do
	configure_run "$tuple"

	echo "Running with: $tuple"
	# A failed run is reported but does not stop the remaining runs.
	bash "$HOME/projects/repo/scripts/neurips/run_sweep_opt_iid_135M.sh" ||
		echo "Sweep run exited with status $? for: $tuple" >&2

	cleanup_after_run

	sleep 5
done
