#!/bin/bash
# Set DATA_PATH before running. Example: export DATA_PATH=/path/to/c4-train.00025-of-01024.json.gz
# PROJECT_ROOT defaults to repo root (parent of grouter_ep_optimizer).
# Launches Megatron-LM/tools/validate_predispatch.py from the project root.
#
# Environment:
#   DATA_PATH      (required) forwarded as --data_path.
#   PROJECT_ROOT   (optional) directory to run from; defaults to the directory
#                  two levels above this script.
#   OUTPUT_PREFIX  (optional) forwarded as --output_prefix; defaults to
#                  ./grouter_predispatch/tf (relative to PROJECT_ROOT).
#
# The script's exit status is that of the python command.
set -euo pipefail

# Fail early with a clear message: an unset/empty DATA_PATH would otherwise
# leave --data_path without a value and make it swallow the next flag.
: "${DATA_PATH:?DATA_PATH must be set, e.g. export DATA_PATH=/path/to/c4-train.00025-of-01024.json.gz}"

SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd)"
PROJECT_ROOT="${PROJECT_ROOT:-$(cd -- "$SCRIPT_DIR/../.." && pwd)}"

# All paths below are relative to PROJECT_ROOT, so refuse to continue if we
# cannot get there.
cd -- "$PROJECT_ROOT" || {
  printf 'error: cannot cd to PROJECT_ROOT: %s\n' "$PROJECT_ROOT" >&2
  exit 1
}

# Quote every expansion so paths containing spaces or glob characters are
# passed through as single arguments.
PYTHONPATH="${PROJECT_ROOT}:${PROJECT_ROOT}/Megatron-LM" \
python Megatron-LM/tools/validate_predispatch.py \
  --output_prefix "${OUTPUT_PREFIX:-./grouter_predispatch/tf}" \
  --key text \
  --data_path "${DATA_PATH}" \
  --grouter_ckpt Megatron-LM/utils_grouter/tf_v2_1_50000.pth \
  --tokenizer_path model_home/deepseek-v2-lite \
  --grouter_config Megatron-LM/utils_grouter/GrtConfig_tf.json \
  --sample_doc_indices 0,4,17,231,723,4812,156239 \
  --check_coverage \
  --validate_tokenized \
  --append_eod