#!/usr/bin/env bash
# bulk_merge.sh  ── merge many HF image datasets + JSONL annotations
#
# Usage examples
# --------------
#   ./bulk_merge.sh datasets.txt
#   ./bulk_merge.sh datasets.txt --repo-id MyUser/merged --output_dir ./merged_arrow
#
# • datasets.txt should contain one dataset name per line (e.g. username/cats).
# • Blank lines and lines starting with “#” are ignored.
# • Any extra flags after the list file are forwarded to merge_annotations.py,
#   so you can override --repo-id, --output_dir, etc. per run.

# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# The dataset list file is the only mandatory argument.
if [[ $# -lt 1 ]]; then
  echo "Usage: $0 DATASET_LIST.txt [extra flags for merge_annotations.py]" >&2
  exit 1
fi

LIST_FILE=$1        # text file with dataset names
shift               # pass the remaining cli args straight to the Python script

# Fail fast with a clear message instead of relying on the later redirection.
# In particular, a directory can be opened for reading without error, so
# without this check the read loop would see no lines and the run could still
# end with the success message. Only directories and missing/unreadable paths
# are rejected; pipes and /dev/stdin remain usable.
if [[ -d "$LIST_FILE" || ! -r "$LIST_FILE" ]]; then
  echo "Error: dataset list '$LIST_FILE' is not a readable file" >&2
  exit 1
fi

# Absolute path of the directory holding this script; merge_annotations.py is
# invoked from this directory, so the script works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Run merge_annotations.py once per dataset named in the list file, forwarding
# any extra command-line flags ("$@") unchanged to every invocation.
#
# The list is read on file descriptor 3 instead of stdin so that the Python
# child keeps the caller's stdin. If the list were fed on stdin, a child that
# reads stdin (for example an interactive prompt) would consume the remaining
# dataset names and silently cut the run short.
# The `|| [[ -n "$DATASET" ]]` clause keeps a final line that has no trailing
# newline.
while IFS= read -r -u 3 DATASET || [[ -n "$DATASET" ]]; do
  # Trim leading and trailing whitespace. This also removes the "\r" left by
  # CRLF line endings, which would otherwise become part of the dataset name.
  DATASET="${DATASET#"${DATASET%%[![:space:]]*}"}"
  DATASET="${DATASET%"${DATASET##*[![:space:]]}"}"

  # Skip empty lines or comments
  [[ -z "$DATASET" || "$DATASET" == \#* ]] && continue

  # printf with %s prints the name verbatim; `echo -e` would interpret any
  # backslash sequences contained in it.
  printf '\n🔄  Processing: %s\n' "$DATASET"

  # `3<&-` closes the list descriptor for the child; it has no use for it.
  python "$SCRIPT_DIR/merge_annotations.py" "$DATASET" "$@" 3<&-
done 3< "$LIST_FILE"

printf '\n✅  All datasets processed.\n'
