#!/bin/bash
# nuke_it.sh — kill all Argus VLLM jobs and free GPUs

# Strict mode: stop on an unhandled command failure (errexit), on expansion
# of an unset variable (nounset), and when any stage of a pipeline fails
# (pipefail).
set -o errexit -o nounset -o pipefail

echo "=== [1] Killing nohup driver process group ==="

#######################################
# Print the argument to pass to `kill` for one driver process.
# Arguments: $1 - driver PID
#            $2 - driver process-group ID (may be empty if the lookup failed)
#            $3 - process-group ID of this script (may be empty)
# Outputs:   "-<pgid>" to signal the whole group, or "<pid>" to signal only
#            the driver when signalling its group would be unsafe.
#######################################
driver_kill_target() {
  local pid=$1 pgid=$2 own_pgid=$3
  # Signal the group only when the PGID is a plain number greater than 1
  # (`kill -- -1` addresses every process we are allowed to signal) and is
  # not our own group (that would terminate this script before the
  # remaining cleanup steps run).
  if [[ "$pgid" =~ ^[0-9]+$ ]] && (( 10#$pgid > 1 )) \
    && [[ "$pgid" != "$own_pgid" ]]; then
    printf -- '-%s\n' "$pgid"
  else
    printf '%s\n' "$pid"
  fi
}

# `|| true`: a failed lookup must not abort the script under errexit/pipefail.
self_pgid=$(ps -o pgid= -p "$$" 2>/dev/null | tr -d ' ' || true)

declare -a driver_targets=()
driver_target_count=0
seen_targets=" "

# Handle every matching driver, not just the first one pgrep reports.
while IFS= read -r NOHUP_PID; do
  [[ "$NOHUP_PID" =~ ^[0-9]+$ ]] || continue
  # The driver may exit between pgrep and ps; tolerate the ps failure so the
  # later GPU/port cleanup still runs.
  PGID=$(ps -o pgid= -p "$NOHUP_PID" 2>/dev/null | tr -d ' ' || true)
  echo "Found nohup PID=$NOHUP_PID, PGID=$PGID"
  target=$(driver_kill_target "$NOHUP_PID" "$PGID" "$self_pgid")
  # Several matches can share one process group; signal each target once.
  case "$seen_targets" in
    *" $target "*) continue ;;
  esac
  seen_targets+="$target "
  driver_targets+=("$target")
  driver_target_count=$((driver_target_count + 1))
done < <(pgrep -f 'bash argus_vllm_evals\.sh' || true)

if (( driver_target_count > 0 )); then
  # Ask politely first, then force after a short grace period. `--` keeps the
  # negative group argument from being parsed as an option.
  for target in "${driver_targets[@]}"; do
    kill -TERM -- "$target" 2>/dev/null || true
  done
  sleep 2
  for target in "${driver_targets[@]}"; do
    kill -KILL -- "$target" 2>/dev/null || true
  done
else
  echo "No nohup driver script found"
fi

echo "=== [2] Killing VLLM workers and servers ==="
# Each pattern is handed to `pkill -f`, which matches it against the full
# command line of running processes. The `|| true` discards pkill's exit
# status so a non-zero result (for example, nothing matched) never aborts
# the script.
for vllm_pattern in \
  'VLLM::Worker_TP' \
  'argus_vllm_server\.py' \
  'argus_vllm_evaluator\.py'; do
  pkill -f "$vllm_pattern" || true
done

echo "=== [3] Killing IR/SQL servers ==="
# Command-line patterns for the auxiliary servers, signalled in this order.
aux_server_patterns=(
  '/argus/.venv/bin/python -u server\.py'
  'sql_server\.py'
)
for aux_pattern in "${aux_server_patterns[@]}"; do
  # Exit status ignored on purpose: pkill returns non-zero when no process
  # matches, which is not an error here.
  pkill -f "$aux_pattern" || true
done

echo "=== [4] Killing any processes still on GPUs (from nvidia-smi) ==="

#######################################
# Print the PIDs that nvidia-smi lists as compute apps.
# Outputs: unique numeric PIDs on stdout, one per line; nothing when
#          nvidia-smi is unavailable, fails, or lists no compute apps.
#######################################
gpu_compute_pids() {
  # Keep purely numeric lines only, so any other text nvidia-smi writes to
  # stdout (e.g. a diagnostic message) is never treated as a PID.
  # `|| true`: an empty result or a missing nvidia-smi is not an error.
  nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null \
    | tr -d ' ' \
    | grep -E '^[0-9]+$' \
    | sort -u \
    || true
}

# First pass: request a clean shutdown.
while IFS= read -r pid; do
  echo "Terminating PID $pid"
  kill -TERM "$pid" 2>/dev/null || true
done < <(gpu_compute_pids)

# Grace period before escalating to SIGKILL.
sleep 2

# Second pass: re-query and force-kill whatever is still listed.
while IFS= read -r pid; do
  echo "Killing PID $pid"
  kill -KILL "$pid" 2>/dev/null || true
done < <(gpu_compute_pids)

echo "=== [5] Double-check with fuser ==="
# Collect the PIDs fuser prints for the NVIDIA device nodes, one per line and
# de-duplicated. `|| true` keeps a non-zero fuser status (it returns one when
# no process is using the files) from aborting the script.
nvidia_holders=$(fuser /dev/nvidia* 2>/dev/null | tr ' ' '\n' | sort -u || true)
# Intentionally unquoted: rely on word splitting to walk the PID list and
# drop empty entries.
for holder in $nvidia_holders; do
  echo "Killing PID $holder from fuser"
  kill -KILL "$holder" 2>/dev/null || true
done

echo "=== [6] Freeing ports 5000/5001/8000 if still bound ==="
# `fuser -k` kills the processes using the given TCP ports.
free_ports_cmd=(fuser -k 5000/tcp 5001/tcp 8000/tcp)
# Elevate with sudo only when it is both needed and present. When already
# root (common in containers, where sudo is often not installed) or when sudo
# is missing, run fuser directly instead of silently doing nothing.
if [[ "$EUID" -ne 0 ]] && command -v sudo >/dev/null 2>&1; then
  free_ports_cmd=(sudo "${free_ports_cmd[@]}")
fi
# Best effort: fuser returns non-zero when no process holds the ports.
"${free_ports_cmd[@]}" 2>/dev/null || true

# Closing banner; it is printed unconditionally, so verify the result with
# nvidia-smi as the message says.
printf '%s\n' "=== All processes nuked. Run nvidia-smi to confirm ==="
