# Tooling metadata for this run (presumably written by the wandb client --
# TODO confirm). Like every other top-level section in this file, the payload
# sits under a `value:` wrapper.
_wandb:
    value:
        # The same version string is repeated under t."5" and t."12" below.
        cli_version: 0.19.9
        m: []
        # The same version string is repeated under t."4" below.
        python_version: 3.10.12
        # Table keyed by quoted integers. The keys and the integer list
        # entries are opaque codes whose meaning is not defined in this file
        # (presumably wandb-internal telemetry enums -- verify against wandb).
        t:
            "1":
                - 1
                - 5
                - 11
                - 30
                - 41
                - 49
                - 50
                - 51
                - 53
                - 55
                - 71
                - 95
                - 98
                - 105
            # This list is identical to the one under "1".
            "2":
                - 1
                - 5
                - 11
                - 30
                - 41
                - 49
                - 50
                - 51
                - 53
                - 55
                - 71
                - 95
                - 98
                - 105
            "3":
                - 2
                - 13
                - 16
                - 23
                - 55
            # Three-part version strings contain two dots, so they load as
            # strings rather than floats.
            "4": 3.10.12
            "5": 0.19.9
            "6": 4.51.1
            "8":
                - 5
            "12": 0.19.9
            "13": linux-x86_64
# Settings for the actor, reference and rollout workers, grouped under one
# section. `hybrid_engine: true` is set below (presumably the three share one
# worker group -- TODO confirm against the training framework).
actor_rollout_ref:
    value:
        actor:
            checkpoint:
                contents:
                    - model
                    - optimizer
                    - extra
            # clip_ratio, clip_ratio_high and clip_ratio_low all carry the
            # same value (0.2).
            clip_ratio: 0.2
            clip_ratio_c: 3
            clip_ratio_high: 0.2
            clip_ratio_low: 0.2
            entropy_coeff: 0
            fsdp_config:
                fsdp_size: -1
                offload_policy: false
                optimizer_offload: true
                param_offload: true
                reshard_after_forward: true
                wrap_policy:
                    min_num_params: 0
            grad_clip: 1
            # KL loss is switched off twice over: the coefficient is 0 here
            # and `use_kl_loss` is false further down.
            kl_loss_coef: 0
            kl_loss_type: low_var_kl
            loss_agg_mode: token-mean
            optim:
                # Written with an explicit ".0" in the mantissa: a bare
                # exponent form without a dot is resolved as a *string* by
                # YAML 1.1 loaders such as PyYAML, whereas this form is a
                # float under both YAML 1.1 and YAML 1.2.
                lr: 1.0e-06
                lr_warmup_steps: -1
                lr_warmup_steps_ratio: 0
                min_lr_ratio: 0
                num_cycles: 0.5
                # Same step count as critic.optim.total_training_steps.
                total_training_steps: 932
                warmup_style: constant
                weight_decay: 0.01
            ppo_epochs: 1
            # The same token budget (14196) is reused by
            # ref.log_prob_max_token_len_per_gpu,
            # rollout.log_prob_max_token_len_per_gpu and
            # rollout.max_num_batched_tokens.
            ppo_max_token_len_per_gpu: 14196
            # Global micro batch size is left null; only the per-GPU
            # variant is set.
            ppo_micro_batch_size: null
            ppo_micro_batch_size_per_gpu: 1
            ppo_mini_batch_size: 4
            shuffle: false
            strategy: fsdp
            ulysses_sequence_parallel_size: 1
            use_dynamic_bsz: true
            use_kl_loss: false
            use_torch_compile: true
        hybrid_engine: true
        model:
            enable_activation_offload: false
            enable_gradient_checkpointing: true
            external_lib: null
            lora_alpha: 16
            # NOTE(review): rank 0 presumably means LoRA is disabled, which
            # would make lora_alpha/target_modules inert -- confirm.
            lora_rank: 0
            # The same path is used as critic.model.tokenizer_path and
            # reward_model.model.input_tokenizer.
            path: /models/Qwen3-0.6B
            target_modules: all-linear
            trust_remote_code: false
            use_fused_kernels: false
            use_liger: false
            use_remove_padding: true
            use_shm: false
        ref:
            fsdp_config:
                param_offload: true
                reshard_after_forward: true
                wrap_policy:
                    min_num_params: 0
            log_prob_max_token_len_per_gpu: 14196
            log_prob_micro_batch_size: null
            log_prob_micro_batch_size_per_gpu: 1
            log_prob_use_dynamic_bsz: true
            strategy: fsdp
            ulysses_sequence_parallel_size: 1
            use_torch_compile: true
        rollout:
            disable_log_stats: true
            do_sample: true
            dtype: bfloat16
            enable_chunked_prefill: true
            enforce_eager: true
            # Per-backend keyword overrides; both entries are null here.
            engine_kwargs:
                sglang:
                    attention_backend: null
                vllm:
                    swap_space: null
            free_cache_engine: true
            gpu_memory_utilization: 0.3
            ignore_eos: false
            layered_summon: false
            load_format: dummy_dtensor
            log_prob_max_token_len_per_gpu: 14196
            log_prob_micro_batch_size: null
            log_prob_micro_batch_size_per_gpu: 1
            log_prob_use_dynamic_bsz: true
            max_model_len: null
            max_num_batched_tokens: 14196
            max_num_seqs: 1024
            mode: sync
            multi_turn:
                completion_callback: null
                enable: false
                format: chatml
                max_turns: null
                tool_config_path: null
            # The key is quoted because a plain `n` is a boolean literal in
            # YAML 1.1. The value matches critic.rollout_n.
            "n": 4
            # Selects the `vllm` entry of engine_kwargs above (presumably --
            # the consuming code is not visible here).
            name: vllm
            # prompt_length / response_length equal data.max_prompt_length
            # (512) and data.max_response_length (2048).
            prompt_length: 512
            response_length: 2048
            temperature: 1
            tensor_model_parallel_size: 4
            top_k: -1
            top_p: 1
            use_fire_sampling: false
            # Validation-time sampling overrides: sampling is off,
            # temperature is 0 and a single response is requested, versus
            # do_sample true / temperature 1 / n 4 above.
            val_kwargs:
                do_sample: false
                "n": 1
                temperature: 0
                top_k: -1
                top_p: 1
# Advantage-estimation and KL-control settings for the run.
algorithm:
    value:
        adv_estimator: grpo
        gamma: 1
        # KL controller block. kl_coef is 0 and use_kl_in_reward (below) is
        # false, so these values presumably have no effect -- TODO confirm.
        kl_ctrl:
            horizon: 10000
            kl_coef: 0
            target_kl: 0.1
            type: fixed
        kl_penalty: kl
        lam: 1
        norm_adv_by_std_in_grpo: true
        # Parameters for the pf_ppo option; use_pf_ppo below is false.
        pf_ppo:
            reweight_method: pow
            weight_pow: 2
        use_kl_in_reward: false
        use_pf_ppo: false
# Critic worker settings.
# NOTE(review): algorithm.adv_estimator is `grpo`, so the critic is presumably
# not instantiated for this run and these may just be recorded defaults --
# confirm against the trainer.
critic:
    value:
        checkpoint:
            contents:
                - model
                - optimizer
                - extra
        cliprange_value: 0.5
        forward_max_token_len_per_gpu: 32768
        forward_micro_batch_size: null
        forward_micro_batch_size_per_gpu: null
        grad_clip: 1
        loss_agg_mode: token-mean
        model:
            enable_activation_offload: false
            enable_gradient_checkpointing: true
            external_lib: null
            fsdp_config:
                fsdp_size: -1
                offload_policy: false
                optimizer_offload: false
                param_offload: false
                reshard_after_forward: true
                wrap_policy:
                    min_num_params: 0
            lora_alpha: 16
            lora_rank: 0
            # A plain scalar that merely starts with `~` is a string; only a
            # bare `~` is null. Whether `~` gets expanded to a home directory
            # depends on the consumer, which is not visible here.
            # NOTE(review): this names a different model than tokenizer_path
            # below -- verify this is intended if the critic is ever enabled.
            path: ~/models/deepseek-llm-7b-chat
            target_modules: all-linear
            # Same path as actor_rollout_ref.model.path.
            tokenizer_path: /models/Qwen3-0.6B
            trust_remote_code: false
            use_remove_padding: false
            use_shm: false
        optim:
            # Written with an explicit ".0" in the mantissa: a bare exponent
            # form without a dot is resolved as a *string* by YAML 1.1
            # loaders such as PyYAML, whereas this form is a float under
            # both YAML 1.1 and YAML 1.2.
            lr: 1.0e-05
            lr_warmup_steps_ratio: 0
            min_lr_ratio: null
            # Same step count as actor_rollout_ref.actor.optim.
            total_training_steps: 932
            warmup_style: constant
            weight_decay: 0.01
        ppo_epochs: 1
        ppo_max_token_len_per_gpu: 32768
        ppo_micro_batch_size: null
        ppo_micro_batch_size_per_gpu: null
        ppo_mini_batch_size: 4
        # Matches actor_rollout_ref.rollout."n".
        rollout_n: 4
        shuffle: false
        strategy: fsdp
        ulysses_sequence_parallel_size: 1
        use_dynamic_bsz: true
# Optional user-supplied reward function. `path` is null here, so presumably
# no custom function is loaded and `name` is only a default -- TODO confirm.
custom_reward_function:
    value:
        name: compute_score
        path: null
# Dataset locations, batching and prompt/response length limits.
data:
    value:
        custom_cls:
            name: null
            path: null
        filter_overlong_prompts: true
        filter_overlong_prompts_workers: 1
        image_key: images
        # These two limits equal actor_rollout_ref.rollout.prompt_length
        # (512) and response_length (2048).
        max_prompt_length: 512
        max_response_length: 2048
        prompt_key: prompt
        return_full_prompt: false
        return_raw_chat: false
        return_raw_input_ids: false
        reward_fn_key: data_source
        # Training data is shuffled; validation_shuffle below is false.
        shuffle: true
        tokenizer: null
        train_batch_size: 32
        train_files: /root/data/gsm8k/train.parquet
        truncation: error
        trust_remote_code: false
        use_shm: false
        val_batch_size: 4
        val_files: /root/data/gsm8k/test.parquet
        validation_shuffle: false
        video_key: videos
# Ray initialisation options; both are null in this run (presumably meaning
# "use the consumer's defaults" -- the consuming code is not visible here).
ray_init:
    value:
        num_cpus: null
        timeline_json_file: null
# Model-based reward settings. `enable` is false, so the model entries below
# are presumably unused in this run -- TODO confirm against the trainer.
reward_model:
    value:
        enable: false
        forward_max_token_len_per_gpu: 32768
        launch_reward_fn_async: false
        max_length: null
        micro_batch_size: null
        micro_batch_size_per_gpu: null
        model:
            external_lib: null
            fsdp_config:
                fsdp_size: -1
                param_offload: false
                reshard_after_forward: true
                wrap_policy:
                    min_num_params: 0
            # Same path as actor_rollout_ref.model.path.
            input_tokenizer: /models/Qwen3-0.6B
            # A plain scalar that merely starts with `~` is a string; only a
            # bare `~` is null.
            path: ~/models/FsfairX-LLaMA3-RM-v0.1
            trust_remote_code: false
            use_fused_kernels: false
            use_remove_padding: false
            use_shm: false
        reward_manager: naive
        sandbox_fusion:
            max_concurrent: 64
            url: null
        strategy: fsdp
        ulysses_sequence_parallel_size: 1
        use_dynamic_bsz: true
# Top-level trainer settings: hardware layout, logging, checkpoint and
# validation cadence.
trainer:
    value:
        balance_batch: true
        critic_warmup: 0
        default_hdfs_dir: null
        default_local_dir: /nfs100/liuhuanyu/Sat_point/Qwen3-0.6B/save_model
        del_local_ckpt_after_load: false
        device: cuda
        experiment_name: qwen3_test_nokl
        log_val_generations: 0
        logger:
            - console
            - wandb
        max_actor_ckpt_to_keep: null
        max_critic_ckpt_to_keep: null
        # One node with 8 GPUs.
        n_gpus_per_node: 8
        nnodes: 1
        project_name: sat_point
        ray_wait_register_center_timeout: 300
        resume_from_path: null
        resume_mode: auto
        rollout_data_dir: null
        # save_freq / test_freq are presumably counted in training steps --
        # the unit is not stated in this file.
        save_freq: 40
        test_freq: 20
        total_epochs: 4
        # Left null here, while the actor and critic `optim` sections both
        # record total_training_steps: 932 (presumably derived from
        # total_epochs and the dataset size -- TODO confirm).
        total_training_steps: null
        val_before_train: true
        validation_data_dir: null
