# Target class for this configuration
_target_: verl.workers.config.VeOmniEngineConfig

# Whether to offload model parameters to CPU
param_offload: false

# Whether to offload optimizer state to CPU
optimizer_offload: false

# Data-parallel mode: fsdp or fsdp2
data_parallel_mode: fsdp2

# Data-parallel degree.
# NOTE(review): presumably the total number of data-parallel ranks
# (replicate size x shard size) - confirm against VeOmniEngineConfig.
data_parallel_size: 1

# NOTE(review): presumably the number of replicated data-parallel groups - confirm.
data_parallel_replicate_size: 1

# NOTE(review): presumably the number of ranks parameters are sharded across - confirm.
data_parallel_shard_size: 1

# Tensor-parallel degree (1 here).
tensor_parallel_size: 1

# Expert-parallel degree (1 here).
expert_parallel_size: 1

# Pipeline-parallel degree (1 here).
pipeline_parallel_size: 1

# Context-parallel degree (1 here).
context_parallel_size: 1

# Ulysses-parallel degree (1 here).
# NOTE(review): presumably Ulysses-style sequence parallelism - confirm.
ulysses_parallel_size: 1

# Whether to enable mixed precision.
# NOTE(review): the precision dtypes are not set in this file - confirm where they come from.
mixed_precision: true

# Random seed for reproducibility.
seed: 42

# Whether to enable full determinism for distributed training. Intended for debugging only.
full_determinism: false

# Device used for model initialization.
# NOTE(review): `meta` presumably means weights are created without allocating
# storage and materialized later - confirm.
init_device: meta

# NOTE(review): presumably toggles full sharding of parameters across the
# data-parallel shard group - confirm.
enable_full_shard: true

# Checkpoint manager to use.
# NOTE(review): `dcp` presumably refers to torch.distributed.checkpoint - confirm.
ckpt_manager: dcp

# FSDP1 only: prefetch the next forward-pass all-gather before the current
# forward computation.
# NOTE(review): data_parallel_mode in this file is fsdp2, so this presumably
# has no effect with the values set here - confirm.
forward_prefetch: true

# Engine strategy name.
strategy: veomni

# Whether to use torch.compile in FSDP.
use_torch_compile: false

# Whether to run FSDP in forward-only mode.
forward_only: false

# NOTE(review): presumably enables FSDP-level CPU offload; how it relates to
# param_offload / optimizer_offload is not evident from this file - confirm.
enable_fsdp_offload: false

# NOTE(review): presumably selects reentrant activation checkpointing - confirm.
enable_reentrant: false

# Attention implementation. Supported values: eager, sdpa, flash_attention_2,
# flash_attention_3, veomni_flash_attention_2_with_sp,
# veomni_flash_attention_3_with_sp and native-sparse.
attn_implementation: flash_attention_2

# MoE implementation: eager or fused
moe_implementation: fused

# NOTE(review): presumably forces the Hugging Face model implementation
# instead of a VeOmni-specific one - confirm.
force_use_huggingface: false

# NOTE(review): meaning and units are not evident from this file - confirm
# against VeOmniEngineConfig before changing.
activation_gpu_limit: 0.0
