#!/usr/bin/env bash

# Pre-flight: block until no process whose command line matches "composer"
# and then "flower-superlink" is left (pgrep -f matches against the full
# command line), re-checking once a minute.
for busy_proc in "composer" "flower-superlink"; do
	until ! pgrep -f "$busy_proc" >/dev/null; do
		echo "$busy_proc is still running. Waiting..."
		sleep 60
	done
done

# Remove everything currently under /dev/shm before the run starts.
rm -rf /dev/shm/*

# Overrides shared by every run of this sweep, one override per element.
fixed_overrides=(
	"fl.n_total_clients=4"
	"fl.n_clients_per_round=4"

	"fl.strategy_name=FEDAVG"
	"fl.strategy_kwargs.server_learning_rate=1.0"

	"+fl.parameter_scheduler_kwargs.EXP_AVG=3"
	"+fl.parameter_scheduler_kwargs.EXP_AVG_SQ=20"

	"llm_config.global_train_batch_size=256"
	"llm_config.device_train_microbatch_size=8"
)
# printf repeats the ' %s' format per element, so every override ends up
# prefixed by exactly one space (including the first one).
FIXED_PARAMETERS=$(printf ' %s' "${fixed_overrides[@]}")

# Overrides specific to this run, appended after the fixed ones.
run_overrides=(
	"fl.reset_optimizer=false"
	"fl.n_local_steps=256"
	"fl.n_rounds=80"
)
run_params="${FIXED_PARAMETERS}$(printf ' %s' "${run_overrides[@]}")"

# Export the parameters for this run
export SWEEP_PARAMETERS="$run_params"

# SWEEP_NAME is not assigned in this script; it is printed as found in the
# environment (empty when unset).
echo "SWEEP_NAME: $SWEEP_NAME"
echo "SWEEP_PARAMETERS: $SWEEP_PARAMETERS"

# ------------------------------------------------------------------
# 1.  Run identity and filesystem layout
# ------------------------------------------------------------------
# DATETIME=$(date '+%Y%m%d_%H%M%S')            # wall-clock timestamp
RUN_UUID="des-1B-IID-20250509_002238"    # fixed id for this run
MODEL_TYPE="SMOLLM_1B"                   # LLM flavour
PROJECT_PATH="$HOME/projects/repo"       # repo root
SAVE_PATH="$PROJECT_PATH/runs/$RUN_UUID" # per-run directory under the repo

# Publish all four to child processes in one go.
export RUN_UUID MODEL_TYPE PROJECT_PATH SAVE_PATH

# ------------------------------------------------------------------
# 2.  Misc runtime toggles
# ------------------------------------------------------------------
COMM_BATCHES=1 # batches per repo comm round
USE_RAY=false  # toggle Ray backend
USE_SHM=true   # toggle POSIX shm transport
USE_S3=false   # optional S3 comm transport

# dataset cache location (shared across nodes)
# export DATASET_CACHE_DIR="$PROJECT_PATH/dataset_cache"
# NOTE: the path is spelled with $HOME, not "~". Bash does not perform tilde
# expansion inside double quotes, so a quoted "~/..." would make mkdir create
# a directory literally named "~" under the current working directory and
# would export that unexpanded path to child processes.
export DATASET_CACHE_DIR="$HOME/datasets/repo/dataset_cache"
mkdir -p "$DATASET_CACHE_DIR"

# ------------------------------------------------------------------
# 3.  Append a big bundle of default overrides
#     (each entry is appended after the sweep parameters; per the
#     original note, later entries are meant to win over earlier ones)
# ------------------------------------------------------------------
# Every element is quoted so that "~llm_config.fsdp_config" and "{}" reach
# the final string literally (no tilde or brace processing by the shell).
default_overrides=(
	"repo.resume_round=-1"
	"repo.checkpoint=true"
	"llm_config.eval_subset_num_batches=0"
	"llm_config.precision=amp_bf16"
	"~llm_config.fsdp_config"
	"fl.use_unigram_metrics=true"
	"fl.allow_unigram_metrics_failures=false"
	"++llm_config.callbacks.noise_scale_monitor={}"
	"llm_config.eval_first=false"
	"llm_config.eval_interval=999999999999ba"
	"llm_config.save_folder=$SAVE_PATH"
	"llm_config.save_num_checkpoints_to_keep=-1"
	"+llm_config.optimizer.report_curvature=true"
	"+llm_config.callbacks.optimizer_monitor.report_curvature=true"
	"llm_config.save_interval=999999ba"
	"fl.parameter_scheduler_kwargs.PARAMETERS=1"
	"fl.eval_period=null"
	"dataset=smollm-corpus-shared"
	"dataset.train.root_local=$DATASET_CACHE_DIR/smollm-corpus-shared"
	"dataset.val.root_local=$DATASET_CACHE_DIR/smollm-corpus-shared"
	"dataset/streams@dataset.train.streams=smollm_corpus_4_clients_proportional"
	"dataset/streams@dataset.val.streams=smollm_corpus_4_clients_proportional"
	"llm_config.eval_loader=null"
)

# Start from the sweep parameters, then tack on each default, space-separated.
EXTERNAL_CONFIGS="$SWEEP_PARAMETERS"
for default_override in "${default_overrides[@]}"; do
	EXTERNAL_CONFIGS+=" $default_override"
done

# ------------------------------------------------------------------
# 4.  repo communication-stack flags (Ray / SHM / S3)
# ------------------------------------------------------------------
EXTERNAL_CONFIGS+=" repo.n_nodes=1"

# A transport flag is emitted only when its USE_* toggle is exactly "true";
# any other value (including unset) leaves the flag out entirely.
if [[ "$USE_RAY" == true ]]; then
	EXTERNAL_CONFIGS+=" repo.comm_stack.ray=true"
fi
if [[ "$USE_SHM" == true ]]; then
	EXTERNAL_CONFIGS+=" repo.comm_stack.shm=true"
fi
if [[ "$USE_S3" == true ]]; then
	EXTERNAL_CONFIGS+=" repo.comm_stack.s3=true"
fi

EXTERNAL_CONFIGS+=" repo.comm_stack.n_batches=$COMM_BATCHES"

# Send metrics to Weights & Biases
EXTERNAL_CONFIGS+=" use_wandb=true"

# Child processes read the assembled override string from the environment.
export EXTERNAL_CONFIGS

# ------------------------------------------------------------------
# 5.  Dynamically pick ports for Flower & Ray
#     (avoids clashes when multiple jobs launch on the same node)
# ------------------------------------------------------------------

# Print the first address reported by `hostname -I`.
first_host_ip() {
	hostname -I | awk '{print $1}'
}

# Print one port obtained from repo.port_utils.get_free_tcp_port.
# $1 is spliced verbatim between the call's parentheses: empty for no
# arguments, or a Python list literal of port strings.
# NOTE(review): the list is presumably a set of ports to avoid -- confirm
# against repo.port_utils.
free_tcp_port() {
	uv run python -c "
from repo.port_utils import get_free_tcp_port
print(get_free_tcp_port($1))"
}

FLOWER_SUPERLINK_IP=$(first_host_ip)

# Ask Python helper to find free ports. Each later lookup is given the ports
# already chosen. A DRIVER_API_ADDRESS / FLEET_API_ADDRESS already present in
# the environment takes precedence over the computed ip:port value.
DRIVER_API_PORT=$(free_tcp_port "")
DRIVER_API_ADDRESS=${DRIVER_API_ADDRESS:-"$FLOWER_SUPERLINK_IP:$DRIVER_API_PORT"}

FLEET_API_PORT=$(free_tcp_port "['${DRIVER_API_PORT}']")
FLEET_API_ADDRESS=${FLEET_API_ADDRESS:-"$FLOWER_SUPERLINK_IP:$FLEET_API_PORT"}

# Ray
RAY_PORT=$(free_tcp_port "['${DRIVER_API_PORT}','${FLEET_API_PORT}']")
RAY_NODE_IP=$(first_host_ip)
RAY_ADDRESS="$RAY_NODE_IP:$RAY_PORT"
RAY_TEMP_DIR="$PROJECT_PATH/ray"

export DRIVER_API_ADDRESS FLEET_API_ADDRESS
export RAY_ADDRESS RAY_PORT RAY_NODE_IP

# ------------------------------------------------------------------
# 6.  Optionally launch Ray head node on this machine
# ------------------------------------------------------------------
# Started in the background; the script does not wait for it before moving on.
if [[ "$USE_RAY" == true ]]; then
	ray_head_cmd=(uv run ray start --head --port="$RAY_PORT" --temp-dir "$RAY_TEMP_DIR")
	"${ray_head_cmd[@]}" &
fi

# ------------------------------------------------------------------
# 7.  Finally invoke the main repo bootstrap script
# ------------------------------------------------------------------
# Runs in the foreground as the last command, so its exit status becomes the
# exit status of this script.
bootstrap_script="$HOME/projects/repo/scripts/repo_base_independent.sh"
bash "$bootstrap_script" -p "$PROJECT_PATH" "$MODEL_TYPE"

# bash "$HOME/projects/repo/scripts/repo_base_bi_independent.sh" \
# 	-p "$PROJECT_PATH" \
# 	"$MODEL_TYPE"
