#!/bin/bash
# Script to run the op_eval_server container with NVIDIA GPU mapping
# Defaults assume 8xA100 on the host (override via CUDA_VISIBLE_DEVICES)
# Abort as soon as any command exits with a non-zero status.
set -e

# --- Fixed settings ----------------------------------------------------------
IMAGE_NAME='op_eval_server_cuda:latest'
CONTAINER_NAME='op_eval_server_cuda'
CONTAINER_ARTIFACTS_DIR='/tmp/op_eval_artifacts'
MAX_OPS_PER_GPU=1

# --- GPU selection -----------------------------------------------------------
# Comma-separated GPU list. A non-empty CUDA_VISIBLE_DEVICES from the caller's
# environment takes precedence; otherwise the eight-GPU default is used.
GPU_IDS_DEFAULT='0,1,2,3,4,5,6,7'
GPU_IDS="${CUDA_VISIBLE_DEVICES:-${GPU_IDS_DEFAULT}}"

# Device count handed to the server: the number of comma-separated fields in
# GPU_IDS, unless the caller already exported DEVICES.
DEVICES_DEFAULT="$(awk -F, '{ print NF }' <<< "${GPU_IDS}")"
DEVICES="${DEVICES:-${DEVICES_DEFAULT}}"

# --- Artifacts ---------------------------------------------------------------
# Host directory that backs the container's artifacts dir; override with
# OP_EVAL_ARTIFACTS_HOST_DIR.
HOST_ARTIFACTS_DIR="${OP_EVAL_ARTIFACTS_HOST_DIR:-/tmp/op_eval_artifacts_cuda}"

echo "Stopping any existing container named $CONTAINER_NAME..."
# Best-effort cleanup of a previous instance: stop it, then remove it.
# Failures (typically "no such container") are deliberately ignored and their
# stderr is discarded so that a first-time launch does not abort under set -e.
for docker_action in stop rm; do
  docker "$docker_action" "$CONTAINER_NAME" 2>/dev/null || true
done

echo "Starting $CONTAINER_NAME..."

# HOST_ARTIFACTS_DIR can come from the environment and is about to be deleted
# recursively, so validate it first:
#   - it must be absolute, because `docker run -v` treats a non-absolute
#     source as a named volume rather than as this directory;
#   - it must contain something other than slashes, so "/" is never wiped.
case "$HOST_ARTIFACTS_DIR" in
  /*[!/]*) ;;
  *)
    echo "ERROR: refusing to reset artifacts dir '$HOST_ARTIFACTS_DIR': expected an absolute path other than '/'." >&2
    exit 1
    ;;
esac

echo "Resetting artifacts dir: $HOST_ARTIFACTS_DIR"
# Start every launch from an empty artifacts tree. `--` stops option parsing
# for paths that begin with '-', and `:?` aborts if the variable is empty.
rm -rf -- "${HOST_ARTIFACTS_DIR:?}"
# _tmp and _home back the TMPDIR/TMP/TEMP and HOME locations handed to the
# container.
mkdir -p -- "$HOST_ARTIFACTS_DIR/_tmp" "$HOST_ARTIFACTS_DIR/_home"

# Launch the server container in the background with the selected GPUs.

# Docker parses the --gpus value as CSV, so an unquoted multi-GPU list such as
# device=0,1,2 is split at the commas and rejected. Wrapping the whole value in
# literal double quotes (the form shown in the Docker documentation) keeps the
# device list together as one field.
GPU_FLAG="\"device=$GPU_IDS\""

# The NVIDIA container runtime exposes only the selected GPUs and numbers them
# from 0 inside the container. Forwarding the host IDs as CUDA_VISIBLE_DEVICES
# would therefore hide GPUs whenever a subset such as 4,5,6,7 is selected, so
# the in-container list is rebuilt as 0..N-1 (N = number of selected GPUs).
IFS=',' read -r -a gpu_id_list <<< "$GPU_IDS"
CONTAINER_GPU_IDS=""
for (( gpu_index = 0; gpu_index < ${#gpu_id_list[@]}; gpu_index++ )); do
  CONTAINER_GPU_IDS+="${CONTAINER_GPU_IDS:+,}${gpu_index}"
done

docker_args=(
  -d
  --name "$CONTAINER_NAME"
  --restart unless-stopped
  # Share the host's network and IPC namespaces with the container.
  --net=host
  --ipc=host
  --gpus "$GPU_FLAG"
  # Bind-mount the freshly reset host artifacts dir into the container.
  -v "$HOST_ARTIFACTS_DIR":"$CONTAINER_ARTIFACTS_DIR"
  # Point the server's temp dir, HOME and the usual temp-dir variables at the
  # mounted artifacts dir.
  -e OP_EVAL_SERVER_TMP="$CONTAINER_ARTIFACTS_DIR"
  -e HOME="$CONTAINER_ARTIFACTS_DIR/_home"
  -e TMPDIR="$CONTAINER_ARTIFACTS_DIR/_tmp"
  -e TMP="$CONTAINER_ARTIFACTS_DIR/_tmp"
  -e TEMP="$CONTAINER_ARTIFACTS_DIR/_tmp"
  # NOTE(review): the variable and the CLI flag below say "NPU" although this
  # is the CUDA launcher; presumably the server uses NPU-era names for all
  # backends. Confirm against op_eval_server before renaming.
  -e MAX_OPS_PER_NPU="$MAX_OPS_PER_GPU"
  -e CUDA_VISIBLE_DEVICES="$CONTAINER_GPU_IDS"
  -e CUDA_DEVICE_ORDER=PCI_BUS_ID
)

docker run "${docker_args[@]}" \
  "$IMAGE_NAME" \
  op_eval_server --host 0.0.0.0 --port 5000 --backend cuda --devices "$DEVICES" --max-ops-per-npu "$MAX_OPS_PER_GPU"

# Confirm the detached launch and show how to follow the container's output.
printf '%s\n' \
  "Server started in detached mode (Auto-Restart Enabled)." \
  "To view logs, run:" \
  "  docker logs -f $CONTAINER_NAME"
