# UME-R1: Exploring Reasoning-Driven Generative Multimodal Embeddings

## 🛠️ Setup

```bash
# Create a dedicated conda environment (Python 3.10), activate it, and then run
# the repository's setup script inside that environment.
conda create -n ume-r1 python=3.10
conda activate ume-r1
bash setup.sh
```

## 💪🏻 Training

### 📚 SFT
1. Please refer to `UME-R1/src/eval/VLM2Vec/experiments/public/data/download_data.sh` to download the image and video data.

2. Download our released JSON dataset containing CoT annotations (the dataset is large and will be open-sourced later).

3. Define the dataset path in `UME-R1/src/sft-train/qwenvl/data/__init__.py`:
```python
# Dataset entry for SFT training; replace both example paths with your own.
MMEB_V2_GROUP = dict(
    annotation_path="/data/your_dataset/annotations.json",
    data_path="/data/your_dataset/images/",
)
```
4. Run the following command to train the SFT model.
```bash
# Launches multi-node SFT training with torchrun + DeepSpeed.
# RANK, MASTER_ADDR and MASTER_PORT are read from the environment; they are not
# set here and must be provided by the cluster launcher.
export prefix=your_path
export llm=Qwen/Qwen2-VL-2B-Instruct
export run_name=UME-2B

# DeepSpeed configuration
deepspeed="${prefix}/UME-R1/src/sft-train/scripts/zero3.json"

# Training hyperparameters
lr=5e-5
batch_size=4
grad_accum_steps=4

# Training entry point
entry_file="${prefix}/UME-R1/src/sft-train/qwenvl/train/train_qwen.py"

# Dataset configuration (replace with public dataset names)
datasets=mmeb_v2_group

# Output configuration
output_dir="${prefix}/output/${run_name}"
log_dir="${prefix}/log"

# Training arguments.
# Kept in an array (not a single string) so every value is passed to the
# trainer as exactly one word, even if a path contains spaces.
args=(
    --deepspeed "${deepspeed}"
    --model_name_or_path "${llm}"
    --dataset_use "${datasets}"
    --data_flatten False
    --tune_mm_vision False
    --tune_mm_mlp True
    --tune_mm_llm True
    --output_dir "${output_dir}"
    --max_steps 5000
    --data_group True
    --bf16
    --per_device_train_batch_size "${batch_size}"
    --per_device_eval_batch_size "$((batch_size * 2))"
    --gradient_accumulation_steps "${grad_accum_steps}"
    --max_pixels 2359296
    --min_pixels 768
    --eval_strategy "no"
    --save_strategy "steps"
    --save_steps 500
    --save_total_limit 10
    --learning_rate "${lr}"
    --weight_decay 0
    --warmup_ratio 0.03
    --max_grad_norm 1
    --lr_scheduler_type "cosine"
    --logging_steps 1
    --model_max_length 12288
    --gradient_checkpointing True
    --dataloader_num_workers 4
    --run_name "${run_name}"
    --report_to none
)

# The log redirection below fails (and torchrun never starts) if the log
# directory does not exist yet, so create it first.
mkdir -p "${log_dir}"

# Launch training
torchrun --node_rank="${RANK}" \
         --nnodes=8 \
         --nproc_per_node=8 \
         --master_addr="${MASTER_ADDR}" \
         --master_port="${MASTER_PORT}" \
         "${entry_file}" "${args[@]}" \
        >>"${log_dir}/${run_name}.log" 2>&1
```

### 📚 GRPO

1. Download our released JSON dataset (the dataset will be open-sourced later).

2. Write the path of the annotation files in the `UME-R1/src/r1-train/data_config/embed.yaml` file.
```yaml
# Each entry under `datasets` gives the path to one annotation JSON file.
datasets:
    - json_path: /path/to/UME-R1-RL.json
```

3. Run the following command to train the RL model.

> [!NOTE] 
> If you encounter a 'CUDA out of memory' error, try reducing `per_device_train_batch_size`.

```bash
# Launches multi-node GRPO (RL) training with torchrun + DeepSpeed.
# RANK, MASTER_ADDR and MASTER_PORT are read from the environment; they are not
# set here and must be provided by the cluster launcher.
export prefix=your_path
# NOTE(review): MODEL_NAME is passed verbatim as --model_name_or_path; presumably
# it should resolve to the SFT checkpoint (e.g. $prefix/output/UME-2B) — confirm.
export MODEL_NAME=UME-2B
export RUN_NAME=UME-R1-2B

# The log redirection below fails (and torchrun never starts) if the log
# directory does not exist yet, so create it first.
mkdir -p "${prefix}/log"

torchrun --node_rank="${RANK}" \
    --nproc_per_node=8 \
    --nnodes=8 \
    --master_addr="${MASTER_ADDR}" \
    --master_port="${MASTER_PORT}" \
    "${prefix}/UME-R1/src/r1-train/src/open_r1/grpo_embed.py" \
    --deepspeed "${prefix}/UME-R1/src/r1-train/local_scripts/zero3.json" \
    --output_dir "${prefix}/output/${RUN_NAME}" \
    --model_name_or_path "${MODEL_NAME}" \
    --dataset_name "${prefix}/UME-R1/src/r1-train/data_config/embed.yaml" \
    --image_root " " \
    --max_prompt_length 1024 \
    --max_completion_length 2048 \
    --beta 0.04 \
    --epsilon_high 0.28 \
    --learning_rate 1e-6 \
    --num_generations 8 \
    --temperature 1 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 2 \
    --logging_steps 1 \
    --bf16 \
    --torch_dtype bfloat16 \
    --data_seed 42 \
    --report_to none \
    --gradient_checkpointing true \
    --attn_implementation flash_attention_2 \
    --num_train_epochs 1 \
    --run_name "${RUN_NAME}" \
    --max_pixels 2359296 \
    --save_steps 50 \
    --save_only_model true \
    >>"${prefix}/log/${RUN_NAME}.log" 2>&1
```

## 📊 Evaluation


1. Please refer to `UME-R1/src/eval/VLM2Vec/experiments/public/data/download_data.sh` to download the image and video data.

2. Run the following command to evaluate the model.


```bash
# Evaluates each model in MODEL_SPECS on every modality in MODALITIES by
# launching eval_twomode.py through torchrun (single node, 8 processes).
#
# Use an absolute path for `prefix`: the script changes directory below and
# keeps building paths from $prefix afterwards.
export prefix=your_path

cd "${prefix}/UME-R1/src/eval/VLM2Vec" || exit

# ==============================================================================
# Configuration
# ==============================================================================
# Must be exported; a plain assignment is not passed on to torchrun.
export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
MODALITIES=("image" "video" "visdoc")

# Per-modality evaluation batch size (per device).
declare -A BATCH_SIZES
BATCH_SIZES=( ["visdoc"]=4 ["video"]=2 ["image"]=4 )

MODE="gen"  # gen or disc

DATA_BASEDIR="${prefix}/data/MMEB-V2"
OUTPUT_BASEDIR="${prefix}/mmeb-output-v2/"
LOG_DIR="${prefix}/eval_log"

RUN_NAME="UME-R1"

# ==> Define models and their base output paths here
# Format: "MODEL_NAME;MODEL_BACKBONE;BASE_OUTPUT_PATH"
declare -a MODEL_SPECS
MODEL_SPECS+=( "${prefix}/output/${RUN_NAME};qwen2_vl;${OUTPUT_BASEDIR}/${RUN_NAME}" )

# The log redirection inside the loop fails (and torchrun never starts) if the
# log directory does not exist yet, so create it up front.
mkdir -p "${LOG_DIR}"

# ==============================================================================
# Main Execution Loop
# ==============================================================================
# Loop through each model specification
for spec in "${MODEL_SPECS[@]}"; do
  # Split the spec string on ';' into model path, backbone and output base
  IFS=';' read -r MODEL_NAME MODEL_BACKBONE BASE_OUTPUT_PATH <<< "$spec"

  echo "================================================="
  echo "🚀 Processing Model: $MODEL_NAME"
  echo "================================================="

  # Loop through each modality for the current model
  for MODALITY in "${MODALITIES[@]}"; do
    DATA_CONFIG_PATH="${prefix}/UME-R1/src/eval/VLM2Vec/experiments/public/eval/${MODALITY}.yaml"
    OUTPUT_PATH="${BASE_OUTPUT_PATH}/${MODALITY}/"
    BATCH_SIZE=${BATCH_SIZES[$MODALITY]}
    echo "-------------------------------------------------"
    echo "  - Modality: $MODALITY"
    echo "  - Output Path: $OUTPUT_PATH"
    echo "  - Batch Size: $BATCH_SIZE"

    # Ensure the output directory exists
    mkdir -p "$OUTPUT_PATH"
    echo "  - Executing command..."

    # Single-node run: fall back to rank 0 and torchrun's default rendezvous
    # address/port when the launcher environment does not provide them, instead
    # of passing empty option values. All modalities append to the same log.
    torchrun --node_rank="${RANK:-0}" \
      --nnodes=1 \
      --nproc_per_node=8 \
      --master_addr="${MASTER_ADDR:-127.0.0.1}" \
      --master_port="${MASTER_PORT:-29500}" \
      eval_twomode.py \
      --per_device_eval_batch_size "$BATCH_SIZE" \
      --model_backbone "$MODEL_BACKBONE" \
      --model_name "$MODEL_NAME" \
      --dataset_config "$DATA_CONFIG_PATH" \
      --encode_output_path "$OUTPUT_PATH" \
      --data_basedir "$DATA_BASEDIR" \
      --max_new_tokens 8192 \
      --resize_max_pixels 2359296 \
      --resize_min_pixels 784 \
      --qry_mode "$MODE" \
      --tgt_mode "$MODE" \
      >>"${LOG_DIR}/${RUN_NAME}_${MODE}_v2.log" 2>&1

    echo "  - Done."
    echo "-------------------------------------------------"
  done
done

echo "✅ All jobs completed."
```


