# Workflow that runs the GPU test jobs defined under `jobs`.
name: GPU CI

# Runs for pushes to main and for pull requests that target main.
# NOTE(review): the jobs in this workflow use a self-hosted runner. If this
# repository accepts pull requests from forks, confirm that fork workflows
# need approval before they are allowed to execute on that machine.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  # GLM4-9B GPU job: executes inside the slime container on the self-hosted
  # runner.
  test-disaggregate-glm4-9B:
    runs-on: self-hosted
    container:
      image: zhuzilin/slime:latest
      # Folded scalar: the lines below are joined with single spaces into one
      # container option string. It requests all GPUs, the host IPC namespace,
      # a 16g /dev/shm and raised memlock/stack ulimits, and bind-mounts the
      # host's /data/models and /data/datasets under /root.
      options: >-
        --gpus all
        --ipc=host
        --shm-size=16g
        --ulimit memlock=-1
        --ulimit stack=67108864
        --memory=0
        --memory-swap=0
        -v /data/models:/root/models
        -v /data/datasets:/root/datasets

    # Default working directory for every `run` step of this job.
    defaults:
      run:
        working-directory: ${{ github.workspace }}

    steps:
      # Fetch the repository contents into the workspace.
      - name: Checkout repository
        uses: actions/checkout@v4

      # Install the checked-out repository into the container's Python
      # environment in editable mode.
      - name: Install slime
        shell: bash
        run: |
          echo "📦 Installing slime..."
          # Quoted so the path is passed to cd as a single word.
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          # Editable install of the package at the repository root. No pip
          # cache is configured by this step.
          pip install -e .
          echo "✅ slime installation completed"

      # Fetch the GLM-Z1-9B-0414 weights and the two datasets with the `hf` CLI
      # into /root/models and /root/datasets. The job's container options
      # bind-mount both directories from the host.
      - name: Download model and dataset
        shell: bash
        run: |
          echo "🔗 Downloading model and dataset..."

          # Make sure the target directories exist even if the mounts are absent.
          mkdir -p /root/models /root/datasets

          echo "Downloading GLM-Z1-9B-0414..."
          hf download zai-org/GLM-Z1-9B-0414 --local-dir /root/models/GLM-Z1-9B-0414

          hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/datasets/dapo-math-17k

          hf download --repo-type dataset zhuzilin/aime-2024 --local-dir /root/datasets/aime-2024

      # Convert the downloaded Hugging Face checkpoint with
      # tools/convert_hf_to_torch_dist.py, launched through torchrun with
      # 8 processes per node.
      - name: Convert checkpoint
        shell: bash
        run: |
          echo "🔄 Converting model checkpoint..."
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          # The sourced script is expected to define the MODEL_ARGS array
          # (it lives outside this file — confirm).
          source scripts/models/glm4-9B.sh
          # The array expansion is quoted so each element reaches the converter
          # as exactly one argument, with no word splitting or glob expansion.
          PYTHONPATH=/root/Megatron-LM torchrun --nproc-per-node 8 tools/convert_hf_to_torch_dist.py \
            "${MODEL_ARGS[@]}" \
            --hf-checkpoint /root/models/GLM-Z1-9B-0414 \
            --save /root/GLM-Z1-9B-0414_torch_dist

      # Run the GLM4-9B quick-start test script from the repository root.
      - name: Run tests
        shell: bash
        run: |
          echo "🧪 Running tests..."
          # Quoted so the path is passed to cd as a single word.
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          bash tests/test_quick_start_glm4-9B.sh
          
      # Runs regardless of the outcome of the earlier steps and kills leftover
      # ray / python processes in the container.
      - name: Cleanup
        if: ${{ always() }}
        shell: bash
        run: |
          echo "🧹 Cleaning up..."
          # Each command may fail (e.g. nothing left to kill) without failing
          # the step.
          pkill -9 ray || :
          ray stop --force || :
          pkill -9 python || :

  # Qwen3-30B-A3B GPU job: executes inside the slime container on the
  # self-hosted runner, and only starts once the GLM4-9B job has succeeded.
  test-colocate-qwen3-30B-A3B:
    needs: test-disaggregate-glm4-9B
    runs-on: self-hosted
    container:
      image: zhuzilin/slime:latest
      # Folded scalar: the lines below are joined with single spaces into one
      # container option string. It requests all GPUs, the host IPC namespace,
      # a 16g /dev/shm and raised memlock/stack ulimits, and bind-mounts the
      # host's /data/models and /data/datasets under /root.
      options: >-
        --gpus all
        --ipc=host
        --shm-size=16g
        --ulimit memlock=-1
        --ulimit stack=67108864
        --memory=0
        --memory-swap=0
        -v /data/models:/root/models
        -v /data/datasets:/root/datasets

    # Default working directory for every `run` step of this job.
    defaults:
      run:
        working-directory: ${{ github.workspace }}

    steps:
      # Fetch the repository contents into the workspace.
      - name: Checkout repository
        uses: actions/checkout@v4

      # Install the checked-out repository into the container's Python
      # environment in editable mode.
      - name: Install slime
        shell: bash
        run: |
          echo "📦 Installing slime..."
          # Quoted so the path is passed to cd as a single word.
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          # Editable install of the package at the repository root. No pip
          # cache is configured by this step.
          pip install -e .
          echo "✅ slime installation completed"

      # Fetch the Qwen3-30B-A3B weights, their FP8 variant and the two datasets
      # with the `hf` CLI into /root/models and /root/datasets. The job's
      # container options bind-mount both directories from the host.
      - name: Download model and dataset
        shell: bash
        run: |
          echo "🔗 Downloading model and dataset..."

          # Make sure the target directories exist even if the mounts are absent.
          mkdir -p /root/models /root/datasets

          echo "Downloading Qwen3-30B-A3B..."
          hf download Qwen/Qwen3-30B-A3B --local-dir /root/models/Qwen3-30B-A3B
          hf download Qwen/Qwen3-30B-A3B-FP8 --local-dir /root/models/Qwen3-30B-A3B-FP8

          hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/datasets/dapo-math-17k

          hf download --repo-type dataset zhuzilin/aime-2024 --local-dir /root/datasets/aime-2024

      # Convert the downloaded Hugging Face checkpoint with
      # tools/convert_hf_to_torch_dist.py, launched through torchrun with
      # 8 processes per node.
      - name: Convert checkpoint
        shell: bash
        run: |
          echo "🔄 Converting model checkpoint..."
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          # The sourced script is expected to define the MODEL_ARGS array
          # (it lives outside this file — confirm).
          source scripts/models/qwen3-30B-A3B.sh
          # The array expansion is quoted so each element reaches the converter
          # as exactly one argument, with no word splitting or glob expansion.
          PYTHONPATH=/root/Megatron-LM torchrun --nproc-per-node 8 tools/convert_hf_to_torch_dist.py \
            "${MODEL_ARGS[@]}" \
            --hf-checkpoint /root/models/Qwen3-30B-A3B \
            --save /root/Qwen3-30B-A3B_torch_dist

      # Run the Qwen3-30B-A3B test script from the repository root.
      - name: Run tests
        shell: bash
        run: |
          echo "🧪 Running tests..."
          # Quoted so the path is passed to cd as a single word.
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          bash tests/test_qwen3-30B-A3B.sh
          
      # Runs regardless of the outcome of the earlier steps and kills leftover
      # ray / python processes in the container.
      - name: Cleanup
        if: ${{ always() }}
        shell: bash
        run: |
          echo "🧹 Cleaning up..."
          # Each command may fail (e.g. nothing left to kill) without failing
          # the step.
          pkill -9 ray || :
          ray stop --force || :
          pkill -9 python || :
