#!/usr/bin/env bash
# ──────────────────────────────────────────────────────────────────
#  run_sweep_opt_iid_135M.sh – wrapper that:
#    • extracts sweep parameters from the SWEEP_PARAMETERS environment
#      variable
#    • builds a big EXTERNAL_CONFIGS string for Hydra / repo
#    • prepares ports, paths, Ray & Flower helpers
#    • finally calls repo_base_independent.sh
# ──────────────────────────────────────────────────────────────────

printf '%s\n' "Running run_sweep_opt_iid_135M.sh"

# ------------------------------------------------------------------
# 0. Sweep parameters come from the caller's environment
# ------------------------------------------------------------------

# Mark SWEEP_PARAMETERS for export so child processes inherit it; its
# value is whatever the caller supplied (nothing is assigned here).
export SWEEP_PARAMETERS

# Log the sweep inputs for later inspection of the job output.
printf 'SWEEP_NAME: %s\n' "$SWEEP_NAME"
printf 'SWEEP_PARAMETERS: %s\n' "$SWEEP_PARAMETERS"

# ------------------------------------------------------------------
# 1.  General run identifiers & paths
# ------------------------------------------------------------------
# RUN_UUID is not assigned in this script (the assignment below is
# disabled), so it has to arrive through the environment.
# export RUN_UUID="135M-$SWEEP_NAME-$DATETIME" # unique run id

# Without RUN_UUID, SAVE_PATH degenerates to "$PROJECT_PATH/runs/" and
# would be identical for every run; warn loudly but keep going so
# existing callers are not broken.
if [[ -z "${RUN_UUID:-}" ]]; then
	printf 'warning: RUN_UUID is empty; SAVE_PATH will not be unique to this run\n' >&2
fi

export MODEL_TYPE="SMOLLM_135M" # LLM flavour

export PROJECT_PATH="$HOME/projects/repo" # repo root
export SAVE_PATH="$PROJECT_PATH/runs/$RUN_UUID"

# ------------------------------------------------------------------
# 2.  Misc runtime toggles
# ------------------------------------------------------------------
USE_RAY=false # toggle Ray backend
USE_SHM=true  # toggle POSIX shm transport
USE_S3=false  # optional S3 comm transport

# dataset cache location (shared across nodes)
# export DATASET_CACHE_DIR="$PROJECT_PATH/dataset_cache"
# NOTE: "$HOME" is used instead of "~" because the shell does not perform
# tilde expansion inside double quotes; a quoted "~/..." would make mkdir
# create a literal "~" directory under the current working directory and
# would hand that literal path to the dataset overrides built later.
export DATASET_CACHE_DIR="$HOME/datasets/repo/dataset_cache"
mkdir -p -- "$DATASET_CACHE_DIR"

# ------------------------------------------------------------------
# 3.  Default overrides appended after the sweep parameters
#     (each entry is meant to win over configs defined earlier)
# ------------------------------------------------------------------
# One override per array element, kept in the order they are appended.
# The prefixes follow the Hydra override grammar: "~" removes a key,
# "+" adds a key, "++" adds or overrides a key.
default_overrides=(
	"repo.checkpoint=true"
	"llm_config.eval_subset_num_batches=0"
	"llm_config.precision=amp_bf16"
	"~llm_config.fsdp_config"
	"fl.use_unigram_metrics=true"
	"fl.allow_unigram_metrics_failures=false"
	"++llm_config.callbacks.noise_scale_monitor={}"
	"llm_config.eval_first=false"
	"llm_config.eval_interval=999999999999ba"
	"llm_config.save_folder=$SAVE_PATH"
	"llm_config.save_num_checkpoints_to_keep=5"
	"+llm_config.optimizer.report_curvature=true"
	"+llm_config.callbacks.optimizer_monitor.report_curvature=true"
	"llm_config.save_interval=999999ba"
	"fl.eval_period=null"
	"dataset=smollm-corpus-shared"
	"dataset.train.root_local=$DATASET_CACHE_DIR/smollm-corpus-shared"
	"dataset.val.root_local=$DATASET_CACHE_DIR/smollm-corpus-shared"
	"dataset/streams@dataset.train.streams=smollm_corpus_4_clients_iid"
	"dataset/streams@dataset.val.streams=smollm_corpus_4_clients_iid"
	"llm_config.eval_loader=null"
)

# Start from the caller-supplied sweep parameters, then add every default
# as a space-separated token.
EXTERNAL_CONFIGS="$SWEEP_PARAMETERS"
for override in "${default_overrides[@]}"; do
	EXTERNAL_CONFIGS+=" $override"
done

# ------------------------------------------------------------------
# 4.  repo communication-stack flags (Ray / SHM / S3)
# ------------------------------------------------------------------
# A flag is appended only when its toggle holds the exact string "true".
if [[ "$USE_RAY" == true ]]; then
	EXTERNAL_CONFIGS+=" repo.comm_stack.ray=true"
fi
if [[ "$USE_SHM" == true ]]; then
	EXTERNAL_CONFIGS+=" repo.comm_stack.shm=true"
fi
if [[ "$USE_S3" == true ]]; then
	EXTERNAL_CONFIGS+=" repo.comm_stack.s3=true"
fi

# Send metrics to Weights & Biases
EXTERNAL_CONFIGS+=" use_wandb=true"

# Exported so that child processes can read the assembled string.
export EXTERNAL_CONFIGS

# ------------------------------------------------------------------
# 5.  Dynamically pick ports for Flower & Ray
#     (avoids clashes when multiple jobs launch on the same node)
# ------------------------------------------------------------------

# Print the first address listed by `hostname -I`.
_first_host_ip() {
	hostname -I | awk '{print $1}'
}

# Print the value returned by repo.port_utils.get_free_tcp_port.
# `uv run` executes the command through uv, the Python project manager.
#   $1 - text placed verbatim between the call's parentheses; the callers
#        below pass a Python list of already-chosen ports (presumably
#        ports to avoid - confirm against repo.port_utils).
_pick_free_port() {
	uv run python -c "
from repo.port_utils import get_free_tcp_port
print(get_free_tcp_port($1))"
}

# Flower: each address keeps a caller-supplied value when one is already
# set and non-empty; otherwise it is built from the local IP and a
# freshly picked port.
FLOWER_SUPERLINK_IP=$(_first_host_ip)

DRIVER_API_PORT=$(_pick_free_port "")
: "${DRIVER_API_ADDRESS:=$FLOWER_SUPERLINK_IP:$DRIVER_API_PORT}"

FLEET_API_PORT=$(_pick_free_port "['${DRIVER_API_PORT}']")
: "${FLEET_API_ADDRESS:=$FLOWER_SUPERLINK_IP:$FLEET_API_PORT}"

# Ray: the port request lists both Flower ports chosen above.
RAY_PORT=$(_pick_free_port "['${DRIVER_API_PORT}','${FLEET_API_PORT}']")
RAY_NODE_IP=$(_first_host_ip)
RAY_ADDRESS="$RAY_NODE_IP:$RAY_PORT"
RAY_TEMP_DIR="$PROJECT_PATH/ray" # not exported; only used within this script

export DRIVER_API_ADDRESS FLEET_API_ADDRESS
export RAY_ADDRESS RAY_PORT RAY_NODE_IP

# ------------------------------------------------------------------
# 6.  Optionally launch Ray head node on this machine
# ------------------------------------------------------------------
# The head node is started as a background job and the script moves on
# without waiting for it.
# NOTE(review): nothing here checks that Ray is up before the bootstrap
# script runs - confirm the bootstrap tolerates that.
case "$USE_RAY" in
true)
	uv run ray start --head --port="$RAY_PORT" --temp-dir "$RAY_TEMP_DIR" &
	;;
esac

# ------------------------------------------------------------------
# 7.  Hand over to the main repo bootstrap script
# ------------------------------------------------------------------
# Kept as the last command so that its exit status becomes the exit
# status of this wrapper.
bootstrap_script="$HOME/projects/repo/scripts/repo_base_independent.sh"
bash "$bootstrap_script" -p "$PROJECT_PATH" "$MODEL_TYPE"

# bash "$HOME/projects/repo/scripts/repo_base_bi_independent.sh" \
# 	-p "$PROJECT_PATH" \
# 	"$MODEL_TYPE"
