#!/bin/bash
# Run the op_eval_server container with Ascend NPU device mapping.
#
# Optional environment overrides:
#   OP_EVAL_ARTIFACTS_HOST_DIR  Host directory for artifacts. It is DELETED and
#                               recreated on every run.
#                               (default: /tmp/op_eval_artifacts)
#   OP_EVAL_PKG_DIR             Path of the op_eval package inside the container.
#   OP_EVAL_HF_CACHE            Host directory holding pre-downloaded
#                               HuggingFace models.
#
# Adjust NPU_DEVICE_IDS below as needed.

# Abort on command failure, unset variables and failed pipeline stages so the
# script never reports success after a failed step.
set -euo pipefail

IMAGE_NAME="op_eval_server:latest"
# Forwarded unchanged to the container as environment variables.
# NOTE(review): their exact meaning is defined by the server, not this script.
MAX_OPS_PER_NPU=1
NPU_VISIBLE_DEVICES=8
CONTAINER_NAME="op_eval_server"
HOST_ARTIFACTS_DIR="${OP_EVAL_ARTIFACTS_HOST_DIR:-/tmp/op_eval_artifacts}"
CONTAINER_ARTIFACTS_DIR="/tmp/op_eval_artifacts"
OP_EVAL_PKG_DIR="${OP_EVAL_PKG_DIR:-/usr/local/python3.11.13/lib/python3.11/site-packages/op_eval}"

# Host NPU indices; each one is mapped into the container as /dev/davinci<N>.
NPU_DEVICE_IDS=(0 1 2 3 4 5 6 7)

# HuggingFace model cache (pre-downloaded models for offline use)
HOST_HF_CACHE="${OP_EVAL_HF_CACHE:-/mnt/cache/huggingface_models}"
CONTAINER_HF_CACHE="/root/.cache/huggingface"

echo "Stopping any existing container named $CONTAINER_NAME..."
# Best effort: the container may not exist yet, so failures are ignored.
docker stop "$CONTAINER_NAME" 2>/dev/null || true
docker rm "$CONTAINER_NAME" 2>/dev/null || true

echo "Starting $CONTAINER_NAME..."
echo "Resetting artifacts dir: $HOST_ARTIFACTS_DIR"
# The directory comes from the environment and is removed recursively, so
# refuse a path that consists only of slashes (the filesystem root).
if [[ "$HOST_ARTIFACTS_DIR" =~ ^/+$ ]]; then
  echo "[ERROR] Refusing to reset artifacts dir '$HOST_ARTIFACTS_DIR'." >&2
  exit 1
fi
rm -rf -- "${HOST_ARTIFACTS_DIR:?}"
mkdir -p "$HOST_ARTIFACTS_DIR"
mkdir -p "$HOST_ARTIFACTS_DIR/_tmp" "$HOST_ARTIFACTS_DIR/_home"
mkdir -p "$HOST_ARTIFACTS_DIR/_pkg_extra-info" "$HOST_ARTIFACTS_DIR/_pkg_kernel_meta"
echo "Mounting op_eval package dirs:"
echo "  $OP_EVAL_PKG_DIR/extra-info -> $HOST_ARTIFACTS_DIR/_pkg_extra-info"
echo "  $OP_EVAL_PKG_DIR/kernel_meta -> $HOST_ARTIFACTS_DIR/_pkg_kernel_meta"

# Build the docker command line as an array so that every value stays a single
# argument even when it contains whitespace or glob characters.
run_args=(
  -d
  --name "$CONTAINER_NAME"
  --restart unless-stopped
  --net=host
  --security-opt seccomp=unconfined
)

for npu_id in "${NPU_DEVICE_IDS[@]}"; do
  run_args+=(--device "/dev/davinci${npu_id}")
done

run_args+=(
  --device /dev/davinci_manager
  --device /dev/devmm_svm
  --device /dev/hisi_hdc
  # Driver tooling and libraries taken from the host installation.
  -v /usr/local/dcmi:/usr/local/dcmi
  -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
  -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
  -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info
  -v /etc/ascend_install.info:/etc/ascend_install.info
  # Artifacts dir, plus host-backed mounts over two op_eval package subdirs.
  -v "$HOST_ARTIFACTS_DIR:$CONTAINER_ARTIFACTS_DIR"
  -v "$HOST_ARTIFACTS_DIR/_pkg_extra-info:$OP_EVAL_PKG_DIR/extra-info"
  -v "$HOST_ARTIFACTS_DIR/_pkg_kernel_meta:$OP_EVAL_PKG_DIR/kernel_meta"
)

# Check if HuggingFace cache exists
if [[ -d "$HOST_HF_CACHE" ]]; then
  echo "Mounting HuggingFace cache: $HOST_HF_CACHE -> $CONTAINER_HF_CACHE"
  hf_cache_real="$(realpath "$HOST_HF_CACHE")"
  # Modern HuggingFace env vars (HF_HOME + HF_HUB_CACHE)
  # HF_HUB_CACHE points directly to where models are stored
  # HF_HUB_OFFLINE=1 enables offline mode (no network requests)
  run_args+=(
    -v "$hf_cache_real:$CONTAINER_HF_CACHE"
    -e "HF_HOME=$CONTAINER_HF_CACHE"
    -e "HF_HUB_CACHE=$CONTAINER_HF_CACHE"
    -e HF_HUB_OFFLINE=1
  )
else
  echo "[WARN] HuggingFace cache not found at $HOST_HF_CACHE. Models will be downloaded on demand."
  echo "[INFO] Run 'python scripts/download_hf_models.py' to pre-download models."
fi

run_args+=(
  # Point the server's scratch space, HOME and temp dirs at the artifacts mount.
  -e "OP_EVAL_SERVER_TMP=$CONTAINER_ARTIFACTS_DIR"
  -e "HOME=$CONTAINER_ARTIFACTS_DIR/_home"
  -e "TMPDIR=$CONTAINER_ARTIFACTS_DIR/_tmp"
  -e "TMP=$CONTAINER_ARTIFACTS_DIR/_tmp"
  -e "TEMP=$CONTAINER_ARTIFACTS_DIR/_tmp"
  -e "MAX_OPS_PER_NPU=$MAX_OPS_PER_NPU"
  -e "NPU_VISIBLE_DEVICES=$NPU_VISIBLE_DEVICES"
  -e OP_EVAL_MSPROF=1
  -e OP_EVAL_MSPROF_KEEP=1
  -e "OP_EVAL_MSPROF_METRICS=ArithmeticUtilization,PipeUtilization,Memory,MemoryL0,MemoryUB,L2Cache,ResourceConflictRatio"
)

# Only report success when docker actually created and started the container.
if ! docker run "${run_args[@]}" "$IMAGE_NAME"; then
  echo "[ERROR] Failed to start container $CONTAINER_NAME." >&2
  exit 1
fi

echo "Server started in detached mode (Auto-Restart Enabled)."
echo "To view logs, run:"
echo "  docker logs -f $CONTAINER_NAME"
