# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
FSDP PPO Trainer with Ray-based single controller.
This trainer supports model-agnostic model initialization with huggingface
"""
import re
import json
import os
import uuid
from collections import defaultdict
from copy import deepcopy
from pprint import pprint
from typing import Optional

import numpy as np
import torch
from tqdm import tqdm

from verl.trainer.ppo.ray_trainer import (
    AdvantageEstimator,
    RayPPOTrainer,
    apply_kl_penalty,
    compute_advantage,
    compute_response_mask,
)


from verl.experimental.dataset.sampler import AbstractCurriculumSampler
from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto
from verl.trainer.ppo.core_algos import AdvantageEstimator, agg_loss
from verl.trainer.ppo.metric_utils import (
    compute_throughout_metrics,
    compute_timing_metrics,
    reduce_metrics,
)
from verl.trainer.ppo.reward import compute_reward, compute_reward_async
from verl.utils.checkpoint.checkpoint_manager import find_latest_ckpt_path, should_save_ckpt_esi
from verl.utils.debug import marked_timer
from verl.utils.metric import (
    reduce_metrics,
)
import random
import time
from concurrent.futures import ThreadPoolExecutor
from verl.trainer.ppo.utils import Role
from verl.single_controller.ray import RayClassWithInitArgs
from verl.utils.config import omega_conf_to_dataclass
from omegaconf import OmegaConf, open_dict
from verl.single_controller.ray.base import create_colocated_worker_cls
from verl.utils.rollout_skip import RolloutSkip

from .agent_metric_utils import compute_val_data_metrics, statistic_text_code_info, process_validation_metrics, compute_data_metrics
from .agent_protocol import Agent_DataProto as DataProto

class RayDAPOTrainerPatialRollout(RayPPOTrainer):
    """
    Note that this trainer runs on the driver process on a single CPU/GPU node.
    """
    def init_workers(self):
        """Initialize distributed training workers using Ray backend.

        Creates:
        1. Ray resource pools from configuration
        2. Worker groups for each role (actor, critic, etc.)
        3. An ``AgentLoopManager`` when the rollout mode is ``"async"``

        Side effects: sets ``self.resource_pool_to_cls``, ``self.actor_rollout_wg``,
        ``self.rm_wg`` (``None`` when no reward model is used), ``self.async_rollout_mode``
        and, depending on configuration, ``self.critic_wg``, ``self.ref_policy_wg`` and
        ``self.async_rollout_manager``.

        Raises:
            NotImplementedError: If ``self.hybrid_engine`` is falsy; only the combined
                actor/rollout worker is handled here.
        """
        self.resource_pool_manager.create_resource_pool()

        # One {role_name: RayClassWithInitArgs} dict per resource pool; filled in below.
        self.resource_pool_to_cls = {pool: {} for pool in self.resource_pool_manager.resource_pool_dict.values()}

        # create actor and rollout
        if self.hybrid_engine:
            resource_pool = self.resource_pool_manager.get_resource_pool(Role.ActorRollout)
            actor_rollout_cls = RayClassWithInitArgs(
                cls=self.role_worker_mapping[Role.ActorRollout],
                config=self.config.actor_rollout_ref,
                role="actor_rollout",
            )
            self.resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls
        else:
            raise NotImplementedError

        # create critic
        if self.use_critic:
            resource_pool = self.resource_pool_manager.get_resource_pool(Role.Critic)
            critic_cfg = omega_conf_to_dataclass(self.config.critic)
            critic_cls = RayClassWithInitArgs(cls=self.role_worker_mapping[Role.Critic], config=critic_cfg)
            self.resource_pool_to_cls[resource_pool]["critic"] = critic_cls

        # create reference policy if needed
        if self.use_reference_policy:
            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RefPolicy)
            ref_policy_cls = RayClassWithInitArgs(
                self.role_worker_mapping[Role.RefPolicy],
                config=self.config.actor_rollout_ref,
                role="ref",
            )
            self.resource_pool_to_cls[resource_pool]["ref"] = ref_policy_cls

        # create a reward model worker when model-based reward is enabled (self.use_rm)
        if self.use_rm:
            # we create a RM here
            resource_pool = self.resource_pool_manager.get_resource_pool(Role.RewardModel)
            rm_cls = RayClassWithInitArgs(self.role_worker_mapping[Role.RewardModel], config=self.config.reward_model)
            self.resource_pool_to_cls[resource_pool]["rm"] = rm_cls

        all_wg = {}
        wg_kwargs = {}  # Setting up kwargs for RayWorkerGroup
        # Optional settings are forwarded only when they are present in the config.
        if OmegaConf.select(self.config.trainer, "ray_wait_register_center_timeout") is not None:
            wg_kwargs["ray_wait_register_center_timeout"] = self.config.trainer.ray_wait_register_center_timeout
        if OmegaConf.select(self.config.global_profiler, "steps") is not None:
            wg_kwargs["profile_steps"] = OmegaConf.select(self.config.global_profiler, "steps")
            # Only require nsight worker options when tool is nsys
            if OmegaConf.select(self.config.global_profiler, "tool") == "nsys":
                assert (
                    OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
                    is not None
                ), "worker_nsight_options must be set when using nsys with profile_steps"
                wg_kwargs["worker_nsight_options"] = OmegaConf.to_container(
                    OmegaConf.select(self.config.global_profiler.global_tool_config.nsys, "worker_nsight_options")
                )
        wg_kwargs["device_name"] = self.device_name

        # For every resource pool: combine its role classes with create_colocated_worker_cls,
        # build one worker group, and merge the mapping returned by spawn() into all_wg,
        # which is indexed by role name below.
        for resource_pool, class_dict in self.resource_pool_to_cls.items():
            worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
            wg_dict = self.ray_worker_group_cls(
                resource_pool=resource_pool,
                ray_cls_with_init=worker_dict_cls,
                **wg_kwargs,
            )
            spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
            all_wg.update(spawn_wg)

        if self.use_critic:
            self.critic_wg = all_wg["critic"]
            self.critic_wg.init_model()

        # The "ref" class is registered above whenever use_reference_policy is set, but a
        # dedicated ref worker group is only attached when ref_in_actor is falsy.
        # NOTE(review): presumably the actor serves as reference otherwise -- confirm.
        if self.use_reference_policy and not self.ref_in_actor:
            self.ref_policy_wg = all_wg["ref"]
            self.ref_policy_wg.init_model()

        self.rm_wg = None
        if self.use_rm:
            self.rm_wg = all_wg["rm"]
            self.rm_wg.init_model()

        # we should create rollout at the end so that vllm can have a better estimation of kv cache memory
        self.actor_rollout_wg = all_wg["actor_rollout"]
        self.actor_rollout_wg.init_model()

        # create async rollout manager and request scheduler
        self.async_rollout_mode = False
        if self.config.actor_rollout_ref.rollout.mode == "async":
            # Imported lazily, so the agent-loop module is only loaded in async mode.
            from .agent_loop.agent_math_loop import AgentLoopManager
            self.async_rollout_mode = True
            self.async_rollout_manager = AgentLoopManager(
                config=self.config, worker_group=self.actor_rollout_wg, rm_wg=self.rm_wg
            )
    def _dump_generations(self, inputs, outputs, scores, reward_extra_infos_dict, dump_path, temperature=1.0):
        """Dump rollout/validation samples as JSONL."""
        os.makedirs(dump_path, exist_ok=True)
        filename = os.path.join(dump_path, f"step_{self.global_steps}_t{temperature}.jsonl")

        n = len(inputs)
        base_data = {
            "input": inputs,
            "output": outputs,
            "score": scores,
            "step": [self.global_steps] * n,
        }

        for k, v in reward_extra_infos_dict.items():
            if len(v) == n:
                base_data[k] = v

        lines = []
        for i in range(n):
            entry = {k: v[i] for k, v in base_data.items()}
            lines.append(json.dumps(entry, ensure_ascii=False))

        with open(filename, "w") as f:
            f.write("\n".join(lines) + "\n")

        print(f"Dumped generations to {filename}")

    def _validate_with_temperatures_sync(self, temperatures=[0.6,1.0]):
        val_metrics_suffixed = {}
        results = []
        for temp in temperatures:
            outputs = self._validate(temperature=temp)
            results.append(outputs)

        for output_metrix, temp in zip(results, temperatures):
            val_metrics_suffixed.update({f'{k}_t{temp}': v for k, v in output_metrix.items()})

        return val_metrics_suffixed

    def _validate_with_temperatures(self, temperatures: list[float] | tuple[float, ...] = (0.6, 1.0), max_workers: int | None = None,) -> dict[str, float]:

        if temperatures is None:
            temperatures = [self.config.actor_rollout_ref.rollout.val_kwargs.temperature]

        print(f"temperatures ==== {temperatures}")
        start = time.time()
        from concurrent.futures import as_completed
        max_workers = max_workers or min(len(temperatures), (os.cpu_count() or 1))
        val_metrics_suffixed: dict[str, float] = {}

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            self.config.actor_rollout_ref.rollout.free_cache_engine_sleep = not self.config.actor_rollout_ref.rollout.free_cache_engine
            future2temp = {
                executor.submit(self._validate, temperature=t): t
                for t in temperatures
            }
            for idx, future in enumerate(as_completed(future2temp)):
                temp = future2temp[future]
                output_metrics = future.result()
                val_metrics_suffixed.update(
                    {f"t{temp}_{k}": v for k, v in output_metrics.items()}
                )
                if idx == len(future2temp) - 1:
                    self.config.actor_rollout_ref.rollout.free_cache_engine_sleep = self.config.actor_rollout_ref.rollout.free_cache_engine

            self.config.actor_rollout_ref.rollout.free_cache_engine_sleep = self.config.actor_rollout_ref.rollout.free_cache_engine
            if self.async_rollout_mode:
                self.async_rollout_manager.sleep()
                self.async_rollout_manager.clear_cache()

        end = time.time()
        print(f"\n _validate_with_temperatures() ======= {end - start}\n")
        return val_metrics_suffixed

    def _validate(self, temperature=None):
        """Run one validation pass over ``self.val_dataloader`` at a given temperature.

        For every validation batch this generates responses (through the sync worker
        group or the async rollout manager), scores them with ``self.val_reward_fn``
        and accumulates inputs, outputs, scores and extra reward info. After the loop
        it logs and optionally dumps the samples, then assembles the metric dict.

        Args:
            temperature: Forwarded to generation as ``meta_info["valid_temperature"]``
                and used in the dump file name. Falls back to
                ``config.actor_rollout_ref.rollout.val_kwargs.temperature`` when ``None``.

        Returns:
            dict: Metric name -> value. An empty dict is returned as soon as a batch is
            seen whose first sample has reward style ``"model"`` while
            ``config.reward_model.enable`` is set.
        """
        start = time.time()
        if temperature is None:
            temperature = self.config.actor_rollout_ref.rollout.val_kwargs.temperature
        data_source_lst = []
        reward_extra_infos_dict: dict[str, list] = defaultdict(list)

        # Lists to collect samples for the table
        uid_lists = []
        sample_inputs = []
        sample_outputs = []
        sample_scores = []
        sample_turns = []

        for test_data in self.val_dataloader:
            test_batch = DataProto.from_single_dict(test_data)
            # One fresh uid per prompt, assigned before repeating so that all repeats
            # of a prompt share the same uid.
            test_batch.non_tensor_batch["uid"] = np.array(
                [str(uuid.uuid4()) for _ in range(len(test_batch.batch))], dtype=object
            )
            # repeat test batch
            test_batch = test_batch.repeat(
                repeat_times=self.config.actor_rollout_ref.rollout.val_kwargs.n, interleave=True
            )

            # we only do validation on rule-based rm
            if self.config.reward_model.enable and test_batch[0].non_tensor_batch["reward_model"]["style"] == "model":
                return {}

            # Store original inputs
            input_ids = test_batch.batch["input_ids"]
            # TODO: Can we keep special tokens except for padding tokens?
            input_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
            batch_uids = test_batch.non_tensor_batch["uid"]
            uid_lists.extend(batch_uids)
            sample_inputs.extend(input_texts)
            # Keys moved from test_batch into the generation batch; the optional
            # non-tensor keys are only popped when they are present.
            batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
            non_tensor_batch_keys_to_pop = ["raw_prompt_ids"]
            if "multi_modal_data" in test_batch.non_tensor_batch:
                non_tensor_batch_keys_to_pop.append("multi_modal_data")
            if "raw_prompt" in test_batch.non_tensor_batch:
                non_tensor_batch_keys_to_pop.append("raw_prompt")
            if "tools_kwargs" in test_batch.non_tensor_batch:
                non_tensor_batch_keys_to_pop.append("tools_kwargs")
            if "interaction_kwargs" in test_batch.non_tensor_batch:
                non_tensor_batch_keys_to_pop.append("interaction_kwargs")
            if "agent_name" in test_batch.non_tensor_batch:
                non_tensor_batch_keys_to_pop.append("agent_name")
            test_gen_batch = test_batch.pop(
                batch_keys=batch_keys_to_pop,
                non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
            )

            # Generation settings for this validation request; "use_code_reward" is
            # always off here, unlike "use_format_reward" which follows the config.
            test_gen_batch.meta_info = {
                "eos_token_id": self.tokenizer.eos_token_id,
                "pad_token_id": self.tokenizer.pad_token_id,
                "recompute_log_prob": False,
                "do_sample": self.config.actor_rollout_ref.rollout.val_kwargs.do_sample,
                "validate": True,
                "global_steps": self.global_steps,
                "use_format_reward": self.config.reward_model.use_format_reward,
                "valid_temperature": temperature,
                "use_code_reward": False
            }
            print(f"test_gen_batch meta info: {test_gen_batch.meta_info}")

            # pad to be divisible by dp_size
            size_divisor = (
                self.actor_rollout_wg.world_size
                if not self.async_rollout_mode
                else self.config.actor_rollout_ref.rollout.agent.num_workers
            )
            test_gen_batch_padded, pad_size = pad_dataproto_to_divisor(test_gen_batch, size_divisor)
            if not self.async_rollout_mode:
                test_output_gen_batch_padded = self.actor_rollout_wg.generate_sequences(test_gen_batch_padded)
            else:
                test_output_gen_batch_padded = self.async_rollout_manager.generate_sequences(test_gen_batch_padded)

            # Drop the padding rows again and re-attach the request meta_info to the output.
            test_output_gen_batch = unpad_dataproto(test_output_gen_batch_padded, pad_size=pad_size)
            test_output_gen_batch.meta_info = test_gen_batch.meta_info
            print("validation generation end")

            # Store generated outputs
            output_ids = test_output_gen_batch.batch["responses"]
            output_texts = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
            sample_outputs.extend(output_texts)

            test_batch = test_batch.union(test_output_gen_batch)
            test_batch.meta_info["validate"] = True

            # evaluate using reward_function
            result = self.val_reward_fn(test_batch, return_dict=True)
            reward_tensor = result["reward_tensor"]
            # The same tensor is stored as both scores and rewards for validation.
            test_batch.batch["token_level_scores"] = reward_tensor
            test_batch.batch["token_level_rewards"] = reward_tensor
            # Per-sample score: token-level values summed over the last dimension.
            scores = reward_tensor.sum(-1).cpu().tolist()
            sample_scores.extend(scores)

            reward_extra_infos_dict["reward"].extend(scores)
            print(f"len reward_extra_infos_dict['reward']: {len(reward_extra_infos_dict['reward'])}")
            if "reward_extra_info" in result:
                for key, lst in result["reward_extra_info"].items():
                    reward_extra_infos_dict[key].extend(lst)
                    print(f"len reward_extra_infos_dict['{key}']: {len(reward_extra_infos_dict[key])}")

            # collect num_turns of each prompt
            if "__num_turns__" in test_batch.non_tensor_batch:
                sample_turns.append(test_batch.non_tensor_batch["__num_turns__"])

            # Samples without a "data_source" entry are labelled "unknown".
            data_source_lst.append(test_batch.non_tensor_batch.get("data_source", ["unknown"] * reward_tensor.shape[0]))
        self._maybe_log_val_generations(inputs=sample_inputs, outputs=sample_outputs, scores=sample_scores)

        metric_dict = {}
        val_data_dir = self.config.trainer.get("validation_data_dir", None)

        max_turns_num = self.config.actor_rollout_ref.rollout.multi_turn.val_max_assistant_turns

        # Dump all collected samples only when a validation data directory is configured.
        if val_data_dir:
            self._dump_generations(
                inputs=sample_inputs,
                outputs=sample_outputs,
                scores=sample_scores,
                reward_extra_infos_dict=reward_extra_infos_dict,
                dump_path=val_data_dir,
                temperature=temperature
            )

        metrix_codes_info = statistic_text_code_info(sample_inputs, sample_outputs, uid_lists, sample_scores, max_turns_num, save_prefix="val/code_info")

        metric_dict.update(metrix_codes_info)
        # NOTE(review): `test_batch` is whatever the final loop iteration left behind, so
        # these metrics cover only the last validation batch, and the name is unbound if
        # the dataloader yields nothing -- confirm this is intended.
        metric_dict.update(compute_val_data_metrics(batch=test_batch))

        # Every extra-info column must be empty or have exactly one entry per sample.
        for key_info, lst in reward_extra_infos_dict.items():
            assert len(lst) == 0 or len(lst) == len(sample_scores), f"{key_info}: {len(lst)=}, {len(sample_scores)=}"

        data_sources = np.concatenate(data_source_lst, axis=0)

        data_src2var2metric2val = process_validation_metrics(data_sources, sample_inputs, reward_extra_infos_dict)

        for data_source, var2metric2val in data_src2var2metric2val.items():
            # "acc" is the headline variable when available, otherwise "reward".
            core_var = "acc" if "acc" in var2metric2val else "reward"
            for var_name, metric2val in var2metric2val.items():
                # Largest integer found after "@" (and before any "/") in the metric names.
                n_max = max([int(name.split("@")[-1].split("/")[0]) for name in metric2val.keys()])
                for metric_name, metric_val in metric2val.items():
                    # Only mean/maj/best metrics of the core variable at n_max are filed
                    # under "val-core"; everything else goes under "val-aux".
                    if (
                        (var_name == core_var)
                        and any(metric_name.startswith(pfx) for pfx in ["mean", "maj", "best"])
                        and (f"@{n_max}" in metric_name)
                    ):
                        metric_sec = "val-core"
                    else:
                        metric_sec = "val-aux"
                    pfx = f"{metric_sec}/{data_source}/{var_name}/{metric_name}"
                    metric_dict[pfx] = metric_val

        # Turn statistics are reported only if at least one batch carried "__num_turns__".
        if len(sample_turns) > 0:
            sample_turns = np.concatenate(sample_turns)
            metric_dict["val-aux/num_turns/min"] = sample_turns.min()
            metric_dict["val-aux/num_turns/max"] = sample_turns.max()
            metric_dict["val-aux/num_turns/mean"] = sample_turns.mean()

        end = time.time()
        print(f"\ntest reward scores===={sample_scores}\n")

        print(f"\ntemperature ==== {temperature}, _validate() ======= {end - start}\n")
        return metric_dict

    def complement_batch_size(self, data_batch, complement_count):
        valid_length = len(data_batch)
        sample_idx_lists = random.sample(range(0,valid_length), k=complement_count)
        select_batch_items = data_batch[sample_idx_lists]
        return select_batch_items

    def select_data_train_uids(self, data_batch, required_rollouts, select_nums, is_random=True):
        num_uid_dict = defaultdict(int)
        for uid in data_batch.non_tensor_batch["uid"]:
            num_uid_dict[uid] += 1

        complete_uids_lists = [uid for uid, count in num_uid_dict.items() if count == required_rollouts]
        if is_random == True:
            select_uid_list = random.sample(complete_uids_lists, k=select_nums)
        else:
            select_uid_list = complete_uids_lists[0: select_nums]

        select_idxs = []
        for idx, uid in enumerate(data_batch.non_tensor_batch["uid"]):
            if uid in select_uid_list:
                select_idxs.append(idx)

        select_batch_items = data_batch[select_idxs]
        return select_batch_items


    def fit(self):
        """
        The training loop of PPO.
        The driver process only need to call the compute functions of the worker group through RPC
        to construct the PPO dataflow.
        The light-weight advantage computation is done on the driver process.
        """
        from omegaconf import OmegaConf

        from verl.utils.tracking import Tracking

        logger = Tracking(
            project_name=self.config.trainer.project_name,
            experiment_name=self.config.trainer.experiment_name,
            default_backend=self.config.trainer.logger,
            config=OmegaConf.to_container(self.config, resolve=True),
        )

        self.global_steps = 0
        self.gen_steps = 0

        # load checkpoint before doing anything
        self._load_checkpoint()

        # perform validation before training
        # currently, we only support validation using the reward_function.
        if self.val_reward_fn is not None and self.config.trainer.get("val_before_train", True):
            val_metrics = self._validate_with_temperatures(temperatures=self.config.actor_rollout_ref.rollout.val_temperature_lists) #[0.6, 1.0]
            assert val_metrics, f"{val_metrics=}"
            pprint(f"Initial validation metrics: {val_metrics}")
            logger.log(data=val_metrics, step=self.global_steps)
            if self.config.trainer.get("val_only", False):
                return


        if self.config.actor_rollout_ref.rollout.get("skip_rollout", False):

            rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg)

            rollout_skip.wrap_generate_sequences()
        # add tqdm
        progress_bar = tqdm(total=self.total_training_steps, initial=self.global_steps, desc="Training Progress")

        # we start from step 1
        self.global_steps += 1
        self.gen_steps += 1
        last_val_metrics = None
        self.max_steps_duration = 0

        prev_step_profile = False
        curr_step_profile = (
            self.global_steps in self.config.global_profiler.steps
            if self.config.global_profiler.steps is not None
            else False
        )
        next_step_profile = False

        partial_batch: Optional[DataProto] = None  # samples whose rollout is not finished yet
        staged_batch: Optional[DataProto] = None  # samples whose rollout has been finished but not yet trained on

        timing_raw = defaultdict(float)
        batch = None
        num_prompt_in_batch = 0
        num_gen_batches = 0

        for epoch in range(self.config.trainer.total_epochs):
            for batch_dict in self.train_dataloader:
                metrics = {}

                with marked_timer("start_profile", timing_raw):
                    self._start_profiling(
                        not prev_step_profile and curr_step_profile
                        if self.config.global_profiler.profile_continuous_steps
                        else curr_step_profile
                    )

                new_batch: DataProto = DataProto.from_single_dict(batch_dict)

                new_batch.non_tensor_batch["uid"] = np.array(
                    [str(uuid.uuid4()) for _ in range(len(new_batch.batch))], dtype=object
                )
                # repeat to align with repeated responses in rollout
                new_batch = new_batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True)
                new_batch.non_tensor_batch["age"] = np.ones(len(new_batch.batch), dtype=int)
                new_batch.non_tensor_batch["raw_response_ids"] = np.fromiter(([] for _ in range(len(new_batch.batch))), dtype=object)
                new_batch.non_tensor_batch["raw_response_mask"] = np.fromiter(([] for _ in range(len(new_batch.batch))), dtype=object)
                new_batch = DataProto.concat_array([partial_batch, new_batch]) if partial_batch is not None else new_batch

                num_gen_batches += 1
                # pop those keys for generation
                batch_keys_to_pop = ["input_ids", "attention_mask", "position_ids"]
                non_tensor_batch_keys_to_pop = ["raw_prompt_ids", "raw_response_ids", "raw_response_mask"]
                if "multi_modal_data" in new_batch.non_tensor_batch:
                    non_tensor_batch_keys_to_pop.append("multi_modal_data")

                if "raw_prompt" in new_batch.non_tensor_batch:
                    non_tensor_batch_keys_to_pop.append("raw_prompt")

                if "agent_name" in new_batch.non_tensor_batch:
                    non_tensor_batch_keys_to_pop.append("agent_name")


                gen_batch = new_batch.pop(
                    batch_keys=batch_keys_to_pop,
                    non_tensor_batch_keys=non_tensor_batch_keys_to_pop,
                )

                retain_keys = ["tools_kwargs", "index", "interaction_kwargs", "age"]
                for r_k in retain_keys:
                    gen_batch.non_tensor_batch[r_k] = new_batch.non_tensor_batch[r_k]

                gen_batch.meta_info["global_steps"] = self.global_steps

                is_last_step = self.gen_steps >= self.total_training_steps

                with marked_timer("step", timing_raw):
                    print(f"\ngen_batch size ==== {len(gen_batch)}\n")

                    with marked_timer("gen", timing_raw, "red"):
                        if not self.async_rollout_mode:
                            gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch)
                        else:
                            gen_batch_output = self.async_rollout_manager.generate_sequences(gen_batch)
                        timing_raw.update(gen_batch_output.meta_info["timing"])
                        gen_batch_output.meta_info.pop("timing", None)

                    with marked_timer("filter", timing_raw):
                        new_batch = new_batch.union(gen_batch_output)

                        finished_mask = new_batch.non_tensor_batch.pop("finished")
                        finished_mask = (new_batch.non_tensor_batch["age"] == self.config.algorithm.partial_rollout_max_split) | finished_mask

                        finished_mask = np.asarray(finished_mask, dtype=bool)

                        print(f'\nnew_batch.age======{new_batch.non_tensor_batch["age"]}\n')

                        print(f"\nfinished_mask======{finished_mask}\n")

                        staged_out, partial_batch = DataProto.split_patial_rollout(new_batch, finished_mask)
                        staged_out.non_tensor_batch.pop("raw_prompt_ids")
                        staged_out.non_tensor_batch.pop("raw_response_ids")
                        staged_out.non_tensor_batch.pop("raw_response_mask")

                        partial_batch.non_tensor_batch["age"] += 1

                        if len(partial_batch.batch) > 0:
                            for key in ("input_ids", "attention_mask", "position_ids"):
                                tmp = partial_batch.batch.pop(key, None)
                                partial_batch.batch[key] = tmp[:, : self.config.data.max_prompt_length]

                            for key in ("prompts", "responses", "response_mask"):
                                # we don't support rollout_log_probs in this feature branch yet
                                partial_batch.batch.pop(key)

                            partial_batch.non_tensor_batch.pop("__num_turns__")

                        else:
                            partial_batch = None

                        # note that we no longer ensure the order of samples in staged_batch
                        staged_batch = DataProto.concat([staged_batch, staged_out]) if staged_batch is not None else staged_out

                        # prompts whose number of finished rollout is enough can be trained on
                        # while filtering, we ensure sample number is divisible by n_gpus_per_node and as large as possible
                        can_train_mask = np.zeros(len(staged_batch.batch), dtype=bool)
                        id2count = defaultdict(int)
                        required_rollouts = self.config.actor_rollout_ref.rollout.n
                        divisor = self.config.actor_rollout_ref.actor.ppo_mini_batch_size * required_rollouts

                        for uid in staged_batch.non_tensor_batch["uid"]:
                            id2count[uid] += 1

                        complete_uids = [uid for uid, count in id2count.items() if count == required_rollouts]

                        total_complete_samples = len(complete_uids) * required_rollouts
                        max_usable_groups = (total_complete_samples // divisor) * divisor // required_rollouts
                        can_train_count = max_usable_groups * required_rollouts

                        print(f"\n\nCan_train_count === {can_train_count}, max_usable_groups ==== {max_usable_groups}, total_complete_samples == {total_complete_samples}")

                        if can_train_count == 0:
                            print(f"total_complete_samples == {total_complete_samples}, no complete uid groups available. Keep generating...")
                            continue

                        selected_uids = set(complete_uids[:max_usable_groups])

                        for i, uid in enumerate(staged_batch.non_tensor_batch["uid"]):
                            if uid in selected_uids:
                                can_train_mask[i] = True

                        new_batch, staged_batch = DataProto.split_patial_rollout(staged_batch, can_train_mask)
                        staged_batch.non_tensor_batch["age"] += 1

                    if self.config.algorithm.adv_estimator == AdvantageEstimator.REMAX:
                        with marked_timer("gen_max", timing_raw, "purple"):
                            gen_baseline_batch = deepcopy(gen_batch)
                            gen_baseline_batch.meta_info["do_sample"] = False
                            if not self.async_rollout_mode:
                                gen_baseline_output = self.actor_rollout_wg.generate_sequences(gen_baseline_batch)
                            else:
                                gen_baseline_output = self.async_rollout_manager.generate_sequences(gen_baseline_batch)

                            new_batch = new_batch.union(gen_baseline_output)
                            reward_baseline_tensor = self.reward_fn(new_batch)
                            reward_baseline_tensor = reward_baseline_tensor.sum(dim=-1)

                            new_batch.pop(batch_keys=list(gen_baseline_output.batch.keys()))

                            new_batch.batch["reward_baselines"] = reward_baseline_tensor

                            del gen_baseline_batch, gen_baseline_output

                    new_batch.meta_info["use_format_reward"] = self.config.reward_model.use_format_reward
                    new_batch.meta_info["use_code_reward"] = self.config.reward_model.use_code_reward

                    with marked_timer("reward", timing_raw, "yellow"):
                        # compute scores. Support both model and function-based.
                        # We first compute the scores using reward model. Then, we call reward_fn to combine
                        # the results from reward model and rule-based results.
                        if self.use_rm and "rm_scores" not in new_batch.batch.keys():
                            # we first compute reward model score
                            reward_tensor = self.rm_wg.compute_rm_score(new_batch)
                            new_batch = new_batch.union(reward_tensor)

                        reward_tensor, reward_extra_infos_dict = compute_reward(new_batch, self.reward_fn)

                        new_batch.batch["token_level_scores"] = reward_tensor

                        if reward_extra_infos_dict:
                            new_batch.non_tensor_batch.update(
                                {k: np.array(v) for k, v in reward_extra_infos_dict.items()}
                            )
                            print(f" reward_extra_infos_dict.keys() = {list(reward_extra_infos_dict.keys())}")

                        # compute rewards. apply_kl_penalty if available
                        if self.config.algorithm.use_kl_in_reward:
                            new_batch, kl_metrics = apply_kl_penalty(
                                new_batch, kl_ctrl=self.kl_ctrl_in_reward, kl_penalty=self.config.algorithm.kl_penalty
                            )
                            metrics.update(
                                kl_metrics
                            )  # TODO: This will be cleared if we use multiple generation batches
                        else:
                            new_batch.batch["token_level_rewards"] = new_batch.batch["token_level_scores"]

                    diff_single_prompt = 0
                    if not self.config.algorithm.filter_groups.enable:
                        batch = new_batch
                    else:  # NOTE: When prompts after filtering is less than train batch size,
                        # we skip to the next generation batch
                        metric_name = self.config.algorithm.filter_groups.metric
                        if metric_name == "seq_final_reward":
                            # Turn to numpy for easier filtering
                            new_batch.non_tensor_batch["seq_final_reward"] = (
                                new_batch.batch["token_level_rewards"].sum(dim=-1).numpy()
                            )
                        elif metric_name == "seq_reward":
                            new_batch.non_tensor_batch["seq_reward"] = (
                                new_batch.batch["token_level_scores"].sum(dim=-1).numpy()
                            )

                        # Collect the sequence reward for each trajectory
                        prompt_uid2metric_vals = defaultdict(list)
                        for uid, metric_val in zip(
                            new_batch.non_tensor_batch["uid"], new_batch.non_tensor_batch[metric_name], strict=True
                        ):
                            prompt_uid2metric_vals[uid].append(metric_val)

                        prompt_uid2metric_std = {}
                        for prompt_uid, metric_vals in prompt_uid2metric_vals.items():
                            prompt_uid2metric_std[prompt_uid] = np.std(metric_vals)

                        kept_prompt_uids = []
                        no_kept_uids = []
                        for uid, std in prompt_uid2metric_std.items():
                            if std > 0 or len(prompt_uid2metric_vals[uid]) == 1:
                                uid_scores = prompt_uid2metric_vals[uid]
                                tag = False
                                for u_sc in uid_scores:
                                    if u_sc > self.config.reward_model.true_score:
                                        tag = True
                                        break
                                    else:
                                        continue
                                if tag == True:
                                    kept_prompt_uids.append(uid)
                                else:
                                    no_kept_uids.append(uid)
                            else:
                                no_kept_uids.append(uid)

                        print(f'no_kept_uids nums ==== {len(no_kept_uids)}, no_kept_uids === {no_kept_uids}')

                        num_prompt_in_batch += len(kept_prompt_uids)

                        kept_traj_idxs = []
                        for idx, traj_from_prompt_uid in enumerate(new_batch.non_tensor_batch["uid"]):
                            if traj_from_prompt_uid in kept_prompt_uids:
                                kept_traj_idxs.append(idx)

                        new_batch = new_batch[kept_traj_idxs]
                        batch = new_batch if batch is None else DataProto.concat([batch, new_batch])

                        prompt_bsz = self.config.data.train_batch_size
                        min_prompt_bsz = self.config.data.train_batch_size * self.config.data.min_train_batch_size_ratio

                        print(f"total valid num_prompt_in_batch == {num_prompt_in_batch}. valid kept_prompt_uids=={len(kept_prompt_uids)}, batches need valid num: self.config.data.train_batch_size==={self.config.data.train_batch_size}")
                        if num_prompt_in_batch < prompt_bsz and num_prompt_in_batch < min_prompt_bsz:
                            print(f"{num_prompt_in_batch=} < {prompt_bsz=}")
                            max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
                            if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
                                print(f"{num_gen_batches=}. Keep generating...")
                                progress_bar.update(1)
                                self.gen_steps += 1
                                continue
                            else:
                                raise ValueError(
                                    f"{num_gen_batches=} >= {max_num_gen_batches=}."
                                    + " Generated too many. Please check if your data are too difficult."
                                    + " You could also try set max_num_gen_batches=0 to enable endless trials."
                                )
                        elif num_prompt_in_batch < prompt_bsz and num_prompt_in_batch >= min_prompt_bsz:
                            diff_single_prompt = prompt_bsz - num_prompt_in_batch
                            n_repeate_diff_single_prompt = self.config.actor_rollout_ref.rollout.n * diff_single_prompt
                            complement_batch = self.complement_batch_size(data_batch=batch, complement_count=n_repeate_diff_single_prompt)
                            batch = DataProto.concat([batch, complement_batch])
                            complement_batch_uids = complement_batch.non_tensor_batch["uid"]
                            print(f"diff_single_prompt === {diff_single_prompt}.")
                            num_complement_uid_dict = defaultdict(int)
                            for uid in complement_batch_uids:
                                num_complement_uid_dict[uid] += 1
                            for k,v in num_complement_uid_dict.items():
                                print(f"complement_batch_uid === {k}, select_nums === {v}")

                        else:
                            batch = self.select_data_train_uids(batch, required_rollouts=self.config.actor_rollout_ref.rollout.n, select_nums=self.config.data.train_batch_size,is_random=True)
                            traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
                            batch = batch[:traj_bsz]

                    if "response_mask" not in batch.batch.keys():
                        batch.batch["response_mask"] = compute_response_mask(batch)
                    # Balance the number of valid tokens across DP ranks.
                    # NOTE: This usually changes the order of data in the `batch`,
                    # which won't affect the advantage calculation (since it's based on uid),
                    # but might affect the loss calculation (due to the change of mini-batching).
                    # TODO: Decouple the DP balancing and mini-batching.
                    if self.config.trainer.balance_batch:
                        self._balance_batch(batch, metrics=metrics)

                    # compute global_valid tokens
                    batch.meta_info["global_token_num"] = torch.sum(batch.batch["attention_mask"], dim=-1).tolist()
                    batch.meta_info["use_format_reward"] = self.config.reward_model.use_format_reward
                    batch.meta_info["use_code_reward"] = self.config.reward_model.use_code_reward

                    # recompute old_log_probs
                    with marked_timer("old_log_prob", timing_raw, color="blue"):
                        old_log_prob = self.actor_rollout_wg.compute_log_prob(batch)
                        entropys = old_log_prob.batch["entropys"]
                        response_masks = batch.batch["response_mask"]
                        loss_agg_mode = self.config.actor_rollout_ref.actor.loss_agg_mode
                        entropy_agg = agg_loss(loss_mat=entropys, loss_mask=response_masks, loss_agg_mode=loss_agg_mode)
                        old_log_prob_metrics = {"actor/entropy": entropy_agg.detach().item()}
                        metrics.update(old_log_prob_metrics)
                        old_log_prob.batch.pop("entropys")
                        batch = batch.union(old_log_prob)

                        if "rollout_log_probs" in batch.batch.keys():
                            # TODO: we may want to add diff of probs too.
                            rollout_old_log_probs = batch.batch["rollout_log_probs"]
                            actor_old_log_probs = batch.batch["old_log_probs"]
                            attention_mask = batch.batch["attention_mask"]
                            responses = batch.batch["responses"]
                            response_length = responses.size(1)
                            response_mask = attention_mask[:, -response_length:]

                            rollout_probs = torch.exp(rollout_old_log_probs)
                            actor_probs = torch.exp(actor_old_log_probs)
                            rollout_probs_diff = torch.abs(rollout_probs - actor_probs)
                            rollout_probs_diff = torch.masked_select(rollout_probs_diff, response_mask.bool())
                            rollout_probs_diff_max = torch.max(rollout_probs_diff)
                            rollout_probs_diff_mean = torch.mean(rollout_probs_diff)
                            rollout_probs_diff_std = torch.std(rollout_probs_diff)
                            metrics.update(
                                {
                                    "training/rollout_probs_diff_max": rollout_probs_diff_max.detach().item(),
                                    "training/rollout_probs_diff_mean": rollout_probs_diff_mean.detach().item(),
                                    "training/rollout_probs_diff_std": rollout_probs_diff_std.detach().item(),
                                }
                            )

                    if self.use_reference_policy:

                        with marked_timer("ref", timing_raw, "olive"):
                            if not self.ref_in_actor:
                                ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch)
                            else:
                                ref_log_prob = self.actor_rollout_wg.compute_ref_log_prob(batch)
                            batch = batch.union(ref_log_prob)

                    # compute values
                    if self.use_critic:
                        with marked_timer("values", timing_raw, "cyan"):
                            values = self.critic_wg.compute_values(batch)
                            batch = batch.union(values)

                    with marked_timer("adv", timing_raw, "brown"):
                        # compute advantages, executed on the driver process
                        norm_adv_by_std_in_grpo = self.config.algorithm.get("norm_adv_by_std_in_grpo", True)
                        batch = compute_advantage(
                            batch,
                            adv_estimator=self.config.algorithm.adv_estimator,
                            gamma=self.config.algorithm.gamma,
                            lam=self.config.algorithm.lam,
                            num_repeat=self.config.actor_rollout_ref.rollout.n,
                            norm_adv_by_std_in_grpo=norm_adv_by_std_in_grpo,
                            config=self.config.algorithm,
                        )

                    # update critic
                    if self.use_critic:
                        with marked_timer("update_critic", timing_raw, "pink"):
                            critic_output = self.critic_wg.update_critic(batch)
                        critic_output_metrics = reduce_metrics(critic_output.meta_info["metrics"])
                        metrics.update(critic_output_metrics)

                    # implement critic warmup
                    if self.config.trainer.critic_warmup <= self.global_steps:
                        # update actor
                        with marked_timer("update_actor", timing_raw, color="red"):
                            batch.meta_info["multi_turn"] = self.config.actor_rollout_ref.rollout.multi_turn.enable
                            actor_output = self.actor_rollout_wg.update_actor(batch)
                        actor_output_metrics = reduce_metrics(actor_output.meta_info["metrics"])
                        metrics.update(actor_output_metrics)

                    # Log rollout generations if enabled
                    rollout_data_dir = self.config.trainer.get("rollout_data_dir", None)
                    max_turns_num = self.config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns

                    if rollout_data_dir:
                        with marked_timer("dump_rollout_generations", timing_raw, color="green"):
                            print(batch.batch.keys())
                            inputs = self.tokenizer.batch_decode(batch.batch["prompts"], skip_special_tokens=True)
                            outputs = self.tokenizer.batch_decode(batch.batch["responses"], skip_special_tokens=True)
                            uid_lists = batch.non_tensor_batch["uid"]
                            scores = batch.batch["token_level_scores"].sum(-1).cpu().tolist()
                            if "request_id" in batch.non_tensor_batch:
                                reward_extra_infos_dict.setdefault(
                                    "request_id",
                                    batch.non_tensor_batch["request_id"].tolist(),
                                )
                            self._dump_generations(
                                inputs=inputs,
                                outputs=outputs,
                                scores=scores,
                                reward_extra_infos_dict=reward_extra_infos_dict,
                                dump_path=rollout_data_dir,
                            )
                            metrix_codes_info = statistic_text_code_info(inputs, outputs, uid_lists, scores, max_turns_num)
                            metrics.update(metrix_codes_info)
                            print(f"\ntrain reward scores===={scores}\n")


                    # Check if the ESI (Elastic Server Instance)/training plan is close to expiration.
                    esi_close_to_expiration = should_save_ckpt_esi(
                        max_steps_duration=self.max_steps_duration,
                        redundant_time=self.config.trainer.esi_redundant_time,
                    )
                    # Check if the conditions for saving a checkpoint are met.
                    # The conditions include a mandatory condition (1) and
                    # one of the following optional conditions (2/3/4):
                    # 1. The save frequency is set to a positive value.
                    # 2. It's the last training step.
                    # 3. The current step number is a multiple of the save frequency.
                    # 4. The ESI(Elastic Server Instance)/training plan is close to expiration.
                    if self.config.trainer.save_freq > 0 and (
                        is_last_step
                        or self.global_steps % self.config.trainer.save_freq == 0
                        or esi_close_to_expiration
                    ):
                        if esi_close_to_expiration:
                            print("Force saving checkpoint: ESI instance expiration approaching.")
                        with marked_timer("save_checkpoint", timing_raw, color="green"):
                            self._save_checkpoint()

                    # validate
                    if (
                        self.val_reward_fn is not None
                        and self.config.trainer.test_freq > 0
                        and (is_last_step or self.global_steps % self.config.trainer.test_freq == 0)
                    ):
                        with marked_timer("testing", timing_raw, "green"):

                            val_metrics = self._validate_with_temperatures(temperatures=self.config.actor_rollout_ref.rollout.val_temperature_lists)
                            if is_last_step:
                                last_val_metrics = val_metrics
                        metrics.update(val_metrics)

                with marked_timer("stop_profile", timing_raw):
                    next_step_profile = (
                        self.global_steps + 1 in self.config.global_profiler.steps
                        if self.config.global_profiler.steps is not None
                        else False
                    )
                    self._stop_profiling(
                        curr_step_profile and not next_step_profile
                        if self.config.global_profiler.profile_continuous_steps
                        else curr_step_profile
                    )
                    prev_step_profile = curr_step_profile
                    curr_step_profile = next_step_profile

                steps_duration = timing_raw["step"]
                self.max_steps_duration = max(self.max_steps_duration, steps_duration)
                # training metrics
                metrics.update(
                    {
                        "training/global_step": self.global_steps,
                        "training/epoch": epoch,
                    }
                )
                self.enable_partial_rollout = self.config.algorithm.partial_rollout_max_split > 1
                if self.enable_partial_rollout:
                    metrics.update(
                        {
                            "training/can_train_count": can_train_count,
                            "training/total_complete_samples": total_complete_samples,
                        }
                    )
                # collect metrics
                metrics.update(compute_data_metrics(batch=batch, use_critic=self.use_critic))
                metrics.update(compute_timing_metrics(batch=batch, timing_raw=timing_raw))
                # TODO: implement actual TFLOPS and theoretical TFLOPS
                n_gpus = self.resource_pool_manager.get_n_gpus()
                metrics.update(compute_throughout_metrics(batch=batch, timing_raw=timing_raw, n_gpus=n_gpus))
                # this is experimental and may be changed/removed in the future in favor of a general-purpose one
                if isinstance(self.train_dataloader.sampler, AbstractCurriculumSampler):
                    self.train_dataloader.sampler.update(batch=batch)

                timing_raw = defaultdict(float)  # clear timing

                metrics["train/num_gen_batches"] = num_gen_batches
                metrics["train/diff_single_prompt_count"] = diff_single_prompt
                batch = None
                num_prompt_in_batch = 0
                num_gen_batches = 0

                # TODO: make a canonical logger that supports various backend
                logger.log(data=metrics, step=self.global_steps)
                progress_bar.update(1)
                self.global_steps += 1
                self.gen_steps += 1
                if (
                    hasattr(self.config.actor_rollout_ref.actor, "profiler")
                    and self.config.actor_rollout_ref.actor.profiler.tool == "torch_memory"
                ):
                    self.actor_rollout_wg.dump_memory_snapshot(
                        tag=f"post_update_step{self.global_steps}", sub_dir=f"step{self.global_steps}"
                    )
                if is_last_step:
                    pprint(f"Final validation metrics: {last_val_metrics}")
                    progress_bar.close()
                    return


                # this is experimental and may be changed/removed in the future
                # in favor of a general-purpose data buffer pool
                if hasattr(self.train_dataset, "on_batch_end"):
                    # The dataset may be changed after each training batch
                    self.train_dataset.on_batch_end(batch=batch)
