# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

"""Megatron optimizer."""

import copy
import logging
import math
import warnings
from abc import ABC, abstractmethod
from itertools import chain
from logging import getLogger
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch

try:
    from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_scale

    multi_tensor_scale_impl = multi_tensor_scale
except ImportError:
    try:
        import amp_C
        from apex.multi_tensor_apply import multi_tensor_applier

        multi_tensor_scale_impl = amp_C.multi_tensor_scale
    except ImportError:

        warnings.warn(
            'Transformer Engine and Apex are not installed. '
            'Falling back to local implementations of '
            'multi_tensor_applier and multi_tensor_scale'
        )

        from megatron.core.utils import local_multi_tensor_applier, local_multi_tensor_scale

        multi_tensor_applier = local_multi_tensor_applier
        multi_tensor_scale_impl = local_multi_tensor_scale

from .. import parallel_state, tensor_parallel
from ..config_logger import has_config_logger_enabled, log_config_to_disk
from ..dist_checkpointing.mapping import ShardedStateDict
from ..dist_checkpointing.optimizer import (
    get_param_id_to_sharded_param_map,
    make_sharded_optimizer_tensor,
    optim_state_to_sharding_state,
)
from ..dist_checkpointing.utils import add_prefix_for_sharding
from ..transformer.module import param_is_not_shared
from ..utils import log_single_rank
from .clip_grads import clip_grad_by_total_norm_fp32, count_zeros_fp32, get_grad_norm_fp32
from .grad_scaler import MegatronGradScaler
from .optimizer_config import OptimizerConfig

logger = getLogger(__name__)


def _zero_grad_group_helper(
    group: List[torch.nn.Parameter], set_to_none: bool, use_decoupled_grad: bool = False
):
    """
    Reset the gradients of every parameter in `group`.
    Note: adapted from torch.optim.optimizer.

    Args:
        group (List[torch.nn.Parameter]): parameters whose gradients are reset.
        set_to_none (bool): if True, the gradient attribute is set to `None`;
            otherwise the existing gradient tensor is zeroed in place.
        use_decoupled_grad (bool, optional): operate on `param.decoupled_grad`
            instead of `param.grad`. Defaults to False.
    """
    attr_name = "decoupled_grad" if use_decoupled_grad else "grad"
    for param in group:
        current_grad = getattr(param, attr_name, None)
        # Nothing to reset when the attribute is missing or already None.
        if current_grad is None:
            continue
        if set_to_none:
            setattr(param, attr_name, None)
            continue
        # Detach from the autograd graph (or switch off requires_grad)
        # before zeroing the tensor in place.
        if current_grad.grad_fn is None:
            current_grad.requires_grad_(False)
        else:
            current_grad.detach_()
        current_grad.zero_()


def _multi_tensor_copy_this_to_that(
    this: List[torch.Tensor], that: List[torch.Tensor], overflow_buf: Optional[torch.Tensor] = None
):
    """
    Copy the values of the tensors in `this` into the tensors in `that`.

    When `overflow_buf` is given, the copy goes through the multi-tensor
    applier; there is no bfloat16 implementation of that path, so without an
    `overflow_buf` we fall back to a plain element-by-element loop copy, which
    is compatible with bfloat16.

    Args:
        this (List[torch.Tensor]): source tensors.
        that (List[torch.Tensor]): destination tensors, paired with `this` by position.
        overflow_buf (torch.Tensor, optional): buffer handed to the multi-tensor
            applier; it is reset to zero before use. Defaults to None.
    """
    if overflow_buf is None:
        for source, destination in zip(this, that):
            destination.copy_(source)
        return

    overflow_buf.fill_(0)
    # Scaling with factor `1.0` is equivalent to copy.
    multi_tensor_applier(multi_tensor_scale_impl, overflow_buf, [this, that], 1.0)


# Keys whose values, taken together, identify a parameter group. They are used by
# MegatronOptimizer._filter_and_reorder_param_groups to match the param groups of a
# loaded state dict against the param groups of the current optimizer.
param_group_identifier_keys = ('wd_mult', 'lr_mult', 'is_expert_parallel', 'is_decoupled_lr')


class MegatronOptimizer(ABC):
    """
    Base class for all Megatron optimizers.

    Args:
        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD.
        config (OptimizerConfig): configuration object for optimizer.
        init_state_fn (Callable, optional): function to initialize state in the optimizer.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        config: OptimizerConfig,
        init_state_fn: Callable = lambda x: None,
    ):
        """Input optimizer is the base optimizer (e.g., Adam)."""
        self.optimizer = optimizer
        if self.optimizer is None:
            warnings.warn(
                f"WARNING: there is no optimizer on RANK {torch.distributed.get_rank()}. "
                "This may be expected if you have frozen sub-models."
            )
        self.config = config
        self.init_state_fn = init_state_fn

    def get_parameters(self) -> List[torch.nn.Parameter]:
        """
        Flatten the parameters of all param groups of the wrapped optimizer.

        Returns:
            List[torch.nn.Parameter]: parameters in param-group order, or an
            empty list when the wrapped optimizer has no `param_groups`.
        """
        if not hasattr(self.optimizer, 'param_groups'):
            return []
        return [
            param
            for param_group in self.optimizer.param_groups
            for param in param_group['params']
        ]

    def get_main_grads_for_grad_norm(self) -> List[torch.Tensor]:
        """
        Collect the main_grads that should contribute to the grad norm.

        A parameter's gradient is kept only when:
          - the grad is not None,
          - the parameter is not shared (so grads are not double counted while
            computing norms),
          - the parameter is not a replica due to tensor model parallelism.

        Returns:
            List[torch.Tensor]: gradients selected for the norm computation.
        """
        selected_grads = []
        for param in self.get_parameters():
            # Pick the gradient tensor according to where this parameter stores it.
            if getattr(param, "__fsdp_param__", False):
                grad = None if param.grad is None else param.grad._local_tensor
            elif self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8:
                grad = getattr(param, "decoupled_grad", None)
            else:
                grad = param.grad

            not_shared = param_is_not_shared(param)
            not_tp_duplicate = tensor_parallel.param_is_not_tensor_parallel_duplicate(param)
            if grad is None or not (not_shared and not_tp_duplicate):
                continue
            selected_grads.append(grad)

        return selected_grads

    def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGroup:
        """Process group for reducing gradient statistics (num_zeros & norm).

        The two most common cases are:
        - Non-distributed optimizer (default): Return the model-parallel group.
        - Distributed optimizer (overridden in distrib_optimizer.py): Return the entire world.

        If the deprecated `model_parallel_group` attribute is present, it is
        moved to `grad_stats_parallel_group` and a deprecation warning is issued.
        """
        # Migrate the deprecated attribute name to the new one.
        if hasattr(self, 'model_parallel_group'):
            warnings.warn(
                "WARNING: `optimizer.model_parallel_group` deprecated and renamed to "
                "`optimizer.grad_stats_parallel_group`. The previous name will be "
                "removed in a future release."
            )
            self.grad_stats_parallel_group = self.model_parallel_group
            del self.model_parallel_group
            return self.grad_stats_parallel_group

        # No explicit group configured: fall back to the model-parallel group.
        if not hasattr(self, 'grad_stats_parallel_group'):
            return parallel_state.get_model_parallel_group()
        return self.grad_stats_parallel_group

    @abstractmethod
    def prepare_grads(self) -> bool:
        """Pre-process gradients before the optimizer step.

        Returns:
            bool: whether inf/nan is found. This base implementation returns False.
        """
        return False

    @abstractmethod
    def step_with_ready_grads(self) -> bool:
        """Step the optimizer with ready gradients.

        Returns:
            bool: whether the step was successful. This base implementation returns True.
        """
        return True

    @torch.no_grad()
    def get_grad_norm(self):
        """Compute and return the grad norm.

        The norm is taken over the grads selected by
        `get_main_grads_for_grad_norm` and reduced over the group returned by
        `get_grad_stats_parallel_group`.
        """
        return get_grad_norm_fp32(
            self.get_main_grads_for_grad_norm(),
            grad_stats_parallel_group=self.get_grad_stats_parallel_group(),
        )

    def clip_grad_norm(self, clip_grad: float) -> float:
        """Compute the grad norm and clip the grads with it.

        Args:
            clip_grad (float): clipping value passed to `clip_grad_by_total_norm_fp32`.

        Returns:
            The grad norm computed before clipping.
        """
        params = self.get_parameters()
        grads_for_norm = self.get_main_grads_for_grad_norm() if params else []

        # The norm is computed even when there are no local parameters.
        grad_norm = get_grad_norm_fp32(
            grads_for_norm, grad_stats_parallel_group=self.get_grad_stats_parallel_group()
        )
        if not params:
            return grad_norm

        clip_grad_by_total_norm_fp32(
            params,
            clip_grad,
            grad_norm,
            self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8,
        )
        return grad_norm

    def count_zeros(self) -> float:
        """Count the number of zeros in the model's gradients.

        The count is delegated to `count_zeros_fp32` over all wrapped
        parameters, using the grad-stats parallel group.
        """
        return count_zeros_fp32(
            self.get_parameters(),
            grad_stats_parallel_group=self.get_grad_stats_parallel_group(),
            use_decoupled_grad=self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8,
        )

    @abstractmethod
    def zero_grad(self, set_to_none: bool = True):
        """Zero gradients and prepare for next forward pass.

        Args:
            set_to_none (bool, optional): flag forwarded by the implementations in
                this file to `_zero_grad_group_helper`. Defaults to True.
        """
        pass

    @abstractmethod
    def get_loss_scale(self) -> torch.Tensor:
        """
        Get current loss scale factor.
        NOTE: The output should be a CUDA tensor of size 1.

        The returned value is multiplied with the loss in `scale_loss`.
        """
        pass

    def scale_loss(self, loss: torch.Tensor) -> torch.Tensor:
        """Multiply `loss` by the current loss scale and return the result."""
        loss_scale = self.get_loss_scale()
        return loss_scale * loss

    @abstractmethod
    def reload_model_params(self, state_dict=None):
        """Refreshes any internal state from the current model parameters.
        Call whenever the parameters are changed outside of the optimizer.
        For example, when we load a model from a checkpoint without loading
        the optimizer, the model parameters are updated but for fp16 optimizer
        with main parameters, the main parameters need to also be updated.

        Args:
            state_dict (dict, optional): When it is not None, we use the params
                from the input state_dict to initialize the main params, instead
                of using the model params for initialization. This is useful when
                the precision of the model params is lower than that of the params
                from the state dict, as it allows the main params to be more accurate.
        """
        pass

    @abstractmethod
    def state_dict(self):
        """Return the optimizer state_dict. Must be implemented by subclasses."""
        pass

    @abstractmethod
    def load_state_dict(self, state_dict):
        """Load the passed-in `state_dict`. Must be implemented by subclasses."""
        pass

    # Promote state so it can be retrieved or set via
    # "optimizer_instance.state"
    def _get_state(self):
        """Return the `state` of the wrapped optimizer."""
        return self.optimizer.state

    def _set_state(self, value):
        """Replace the `state` of the wrapped optimizer with `value`."""
        self.optimizer.state = value

    # Read/write property that forwards to the wrapped optimizer's `state`.
    state = property(_get_state, _set_state)

    # Promote param_groups so it can be retrieved or set via
    # "optimizer_instance.param_groups"
    # (for example, to adjust the learning rate)
    def _get_param_groups(self):
        """Return the wrapped optimizer's param groups.

        Returns an empty list when `self.is_stub_optimizer` is set.
        NOTE(review): `is_stub_optimizer` is not assigned by this base class;
        it is set by the subclasses visible in this file.
        """
        if self.is_stub_optimizer:
            return []
        else:
            return self.optimizer.param_groups

    def _set_param_groups(self, value):
        """Replace the wrapped optimizer's param groups with `value`."""
        self.optimizer.param_groups = value

    # Read/write property that forwards to the wrapped optimizer's `param_groups`.
    param_groups = property(_get_param_groups, _set_param_groups)

    @abstractmethod
    def step(self):
        """Step the optimizer. Must be implemented by subclasses."""
        pass

    @abstractmethod
    def sharded_state_dict(
        self,
        model_sharded_state_dict: ShardedStateDict,
        is_loading: bool = False,
        metadata: Optional[dict] = None,
    ) -> ShardedStateDict:
        """Builds sharded state dict for the optimizer, based on model's sharded state dict.

        Must be implemented by subclasses.

        Args:
            model_sharded_state_dict (ShardedStateDict): sharded state dict of the model
            is_loading (bool, optional): flag indicating whether the state dict will be
                used to save or load the optimizer state. Defaults to False.
            metadata (dict, optional): metadata controlling the sharded_state_dict logic.

        Returns: optimizer sharded state dict
        """

    @staticmethod
    def _extract_common_per_param_step(state_dict) -> Union[int, torch.Tensor, None]:
        common_step = None
        for param_idx, param_state in state_dict['state'].items():
            param_step = param_state.get('step', None)
            if param_step is not None:
                if common_step is None:
                    common_step = param_step
                elif common_step != param_step:
                    raise ValueError(
                        "The optimizer step differs per parameter. Mcore only supports "
                        "optimizers whose step is shared across all parameters."
                    )
        return common_step

    @staticmethod
    def _restore_common_per_param_step(state_dict: Dict, step: Union[int, torch.Tensor]):
        for param_idx, param_state in state_dict['state'].items():
            param_state['step'] = copy.deepcopy(step)

    def offload_to_cpu(self):
        """Function used for RL training.
        Move optimizer state tensors to CPU to free GPU memory during inference.

        Does nothing when there is no wrapped optimizer or when this is a stub
        optimizer. Otherwise the data of every CUDA parameter in the param
        groups and every CUDA tensor in the optimizer state is moved to CPU,
        and the CUDA caching allocator is emptied afterwards.

        Relies on the module-level `import logging` for the `logging.INFO` level.
        """
        if getattr(self, 'optimizer', None) is None or getattr(
            self, 'is_stub_optimizer', False
        ):
            return

        log_single_rank(logger, logging.INFO, '[OFFLOAD] moving optimizer state to CPU')
        # Move all optimizer tensors to CPU while keeping the optimizer instance
        for param_group in self.optimizer.param_groups:
            for p in param_group['params']:
                if isinstance(p, torch.Tensor) and p.is_cuda:
                    p.data = p.data.cpu()

        # Only values are replaced, so mutating the dict while iterating is safe.
        for param_state in self.optimizer.state.values():
            for k, v in param_state.items():
                if isinstance(v, torch.Tensor) and v.is_cuda:
                    param_state[k] = v.cpu()

        torch.cuda.empty_cache()

    def restore_from_cpu(self):
        """Function used for RL training.
        Restore optimizer state tensors from CPU back to GPU for training.

        Does nothing when there is no wrapped optimizer or when this is a stub
        optimizer. Otherwise the data of every non-CUDA parameter in the param
        groups and every non-CUDA tensor in the optimizer state is moved with
        `.cuda()`.

        Relies on the module-level `import logging` for the `logging.INFO` level.
        """
        if getattr(self, 'optimizer', None) is None or getattr(
            self, 'is_stub_optimizer', False
        ):
            return

        log_single_rank(logger, logging.INFO, '[RESTORE] moving optimizer state back to GPU')
        # Move all optimizer tensors back to GPU
        for param_group in self.optimizer.param_groups:
            for p in param_group['params']:
                if isinstance(p, torch.Tensor) and not p.is_cuda:
                    p.data = p.data.cuda()

        # Only values are replaced, so mutating the dict while iterating is safe.
        for param_state in self.optimizer.state.values():
            for k, v in param_state.items():
                if isinstance(v, torch.Tensor) and not v.is_cuda:
                    param_state[k] = v.cuda()

    @staticmethod
    def _filter_and_reorder_param_groups(
        current_groups: List[Dict], state_dict_groups: List[Dict]
    ) -> List[Dict]:
        """Filter and reorder state_dict parameter groups to match current optimizer groups.
        Groups are matched on the values of `param_group_identifier_keys`:
        (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr)

        The returned groups are the dicts from `state_dict_groups`; their
        'params' entries are overwritten in place.

        Args:
            current_groups (List[Dict]): Parameter groups from the current optimizer instance.
            state_dict_groups (List[Dict]): Parameter groups loaded from a state dict.

        Returns:
            List[Dict]: Filtered and reordered parameter groups matching the current optimizer.

        Raises:
            ValueError: If parameter groups in state dict don't match current optimizer.
        """

        def _identifier(group: Dict) -> tuple:
            # NeMo may have different key for required fields, e.g., "wd_mult" to "pre_wd_mult"
            return tuple(
                group[key] if key in group else group[f"pre_{key}"]
                for key in param_group_identifier_keys
            )

        # Group order required by the current optimizer (coming from runtime).
        wanted_identifiers = [_identifier(group) for group in current_groups]

        # Keep state_dict param group order since groups are LocalNonpersistentObject
        # and their order is determined at runtime, not from the checkpoint.
        saved_params_in_order = [group['params'] for group in state_dict_groups]
        saved_groups_by_identifier = {_identifier(group): group for group in state_dict_groups}

        reordered_groups = []
        for identifier, params in zip(wanted_identifiers, saved_params_in_order):
            if identifier not in saved_groups_by_identifier:
                available_keys = '\n'.join(str(k) for k in saved_groups_by_identifier.keys())
                raise ValueError(
                    f"Could not find parameter group with key {identifier} in loaded checkpoint.\n"
                    f"Available keys:\n{available_keys}\n"
                    f"Parameter group key definition: {param_group_identifier_keys}"
                )

            # Update group's parameters to preserve state dict ordering
            matched_group = saved_groups_by_identifier[identifier]
            matched_group['params'] = params
            reordered_groups.append(matched_group)

        return reordered_groups


class MixedPrecisionOptimizer(MegatronOptimizer):
    """Base class for both the float-16 and the distributed optimizer.

    Args:
        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD.
        config (OptimizerConfig): configuration object for optimizer.
        grad_scaler (MegatronGradScaler): used for scaling gradients. Note that
            this can be None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
            a constant gradient scaler. Also for `bf16 = False`, we
            always require a grad scaler.
        init_state_fn (Callable, optional): function to initialize state in the optimizer.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        config: OptimizerConfig,
        grad_scaler: Optional[MegatronGradScaler],
        init_state_fn: Callable,
    ):
        """Set up the grad scaler and the helper tensors used for mixed precision.

        Args:
            optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD.
            config (OptimizerConfig): configuration object for optimizer.
            grad_scaler (MegatronGradScaler, optional): used for scaling gradients.
                May be None only when `config.fp16` is False.
            init_state_fn (Callable): function to initialize state in the optimizer.
        """
        # Log the constructor arguments (via locals()) when config logging is enabled.
        if has_config_logger_enabled(config):
            log_config_to_disk(config, locals(), prefix=type(self).__name__)

        super().__init__(optimizer, config, init_state_fn)
        self.grad_scaler = grad_scaler

        # None grad scaler is only supported for bf16.
        if self.grad_scaler is None:
            assert not self.config.fp16, 'fp16 expects a grad scaler.'

        # Tensor used to determine if a nan/inf has happened.
        # Any non-zero value indicates inf/nan.
        # NOTE(review): this tensor is only allocated when a grad scaler is
        # present (e.g. bfloat16 with a grad scaler); without a grad scaler
        # `self.found_inf` is never defined by this constructor.
        if self.grad_scaler:
            self.found_inf = torch.tensor([0.0], dtype=torch.float, device='cuda')

        # Dummy tensor needed for apex multi-apply tensor.
        # For bfloat, we don't have multi-tensor apply and for now
        # we set it to none so the multi-tensor apply gets ignored.
        if self.config.bf16:
            self._dummy_overflow_buf = None
        else:
            self._dummy_overflow_buf = torch.tensor([0], dtype=torch.int, device='cuda')

        # In case grad scaler is not passed, define the unity scale.
        if self.grad_scaler is None:
            self._scale_one = torch.tensor([1.0], dtype=torch.float, device='cuda')

    def get_loss_scale(self):
        """Return the grad scaler's scale, or the unity scale when there is no grad scaler."""
        scaler = self.grad_scaler
        return self._scale_one if scaler is None else scaler.scale

    def reload_model_params(self, state_dict=None):
        """Re-initialize the main params from the model params (or from `state_dict`).

        Nothing is done when the optimizer has no param groups.

        Args:
            state_dict (dict, optional): forwarded to
                `_copy_model_params_to_main_params`. Defaults to None.
        """
        if not self.param_groups:
            return
        self._copy_model_params_to_main_params(state_dict=state_dict)

    def _unscale_main_grads_and_check_for_nan(self):
        """Unscale the main grads in place and report whether any inf/nan was found.

        The flag in `self.found_inf` is max-reduced over the grad-stats parallel
        group. A stub optimizer skips the collection and unscaling but still takes
        part in the all-reduce.

        Returns:
            bool: True if `self.found_inf` is greater than zero after the reduction.
        """

        # Collect main grads.
        if not self.is_stub_optimizer:
            main_grads = self._collect_main_grad_data_for_unscaling()

        # Reset found inf.
        self.found_inf.fill_(0.0)

        if not self.is_stub_optimizer:
            # Unscale and set found inf/nan
            torch._amp_foreach_non_finite_check_and_unscale_(
                main_grads, self.found_inf, self.grad_scaler.inv_scale
            )

        # Update across all model parallel instances.
        torch.distributed.all_reduce(
            self.found_inf,
            op=torch.distributed.ReduceOp.MAX,
            group=self.get_grad_stats_parallel_group(),
        )

        # Check for nan.
        # `.item()` copies the flag to the host as a Python number.
        found_inf_flag = self.found_inf.item() > 0

        return found_inf_flag

    @torch.no_grad()
    def prepare_grads(self) -> bool:
        """Pre-process gradients before the optimizer step.

        Copies the model grads to the main grads (unless this is a stub
        optimizer) and, when a grad scaler is provided, unscales the grads,
        checks them for inf/nan and updates the grad scaler.

        Returns:
            bool: whether inf/nan is found; always False without a grad scaler.
        """
        timers = self.config.timers
        timing_enabled = timers is not None

        # Copy gradients from model params to main params.
        if timing_enabled:
            timers('optimizer-copy-to-main-grad', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        if not self.is_stub_optimizer:
            self._copy_model_grads_to_main_grads()
        if timing_enabled:
            timers('optimizer-copy-to-main-grad').stop()

        # Unscaling, the inf check and the grad scaler update only happen
        # when a grad scaler is provided.
        if not self.grad_scaler:
            return False

        # Unscale and check for inf/nan.
        if timing_enabled:
            timers('optimizer-unscale-and-check-inf', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        found_inf_flag = self._unscale_main_grads_and_check_for_nan()
        if timing_enabled:
            timers('optimizer-unscale-and-check-inf').stop()

        # We are done with scaling gradients so we can update the loss scale.
        self.grad_scaler.update(found_inf_flag)
        return found_inf_flag

    @torch.no_grad()
    def step_with_ready_grads(self) -> bool:
        """Step the wrapped optimizer and copy the updated main params back.

        A stub optimizer skips both the inner step and the copy.

        Returns:
            bool: always True.
        """
        timers = self.config.timers
        timing_enabled = timers is not None
        has_real_optimizer = not self.is_stub_optimizer

        # Step the optimizer.
        if timing_enabled:
            timers('optimizer-inner-step', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        if has_real_optimizer:
            self.optimizer.step()
        if timing_enabled:
            timers('optimizer-inner-step').stop()

        # Update params from main params.
        if timing_enabled:
            timers('optimizer-copy-main-to-model-params', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        if has_real_optimizer:
            if not self.config.reuse_grad_buf_for_mxfp8_param_ag:
                self._copy_main_params_to_model_params()
            elif not self.config.overlap_param_gather:
                # In the case of overlap_param_gather,
                # copy is manually called in the training loop
                self._copy_main_params_to_param_buffer()
        if timing_enabled:
            timers('optimizer-copy-main-to-model-params').stop()

        return True

    @torch.no_grad()
    def step(self):
        """Run a full optimizer step: prepare grads, clip, count zeros, update.

        Returns:
            tuple: `(success, grad_norm, num_zeros_in_grad)`. When inf/nan is
            found while preparing the grads, the step is skipped and
            `(False, None, None)` is returned.
        """
        timers = self.config.timers
        timing_enabled = timers is not None

        # Skip the update entirely when inf/nan was found.
        if self.prepare_grads():
            return False, None, None

        # Clip the main gradients.
        if timing_enabled:
            timers('optimizer-clip-main-grad', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        if self.config.clip_grad > 0.0:
            grad_norm = self.clip_grad_norm(self.config.clip_grad)
        else:
            grad_norm = 0.0
        if timing_enabled:
            timers('optimizer-clip-main-grad').stop()

        # Count the zeros in the grads.
        if timing_enabled:
            timers('optimizer-count-zeros', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        if self.config.log_num_zeros_in_grad:
            num_zeros_in_grad = self.count_zeros()
        else:
            num_zeros_in_grad = 0
        if timing_enabled:
            timers('optimizer-count-zeros').stop()

        # Successful update.
        return self.step_with_ready_grads(), grad_norm, num_zeros_in_grad


class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
    """Float16 optimizer for fp16 and bf16 data types.

    Args:
        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD.
        config (OptimizerConfig): configuration object for optimizer.
        grad_scaler (MegatronGradScaler): used for scaling gradients. Note that
            this can be None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
            a constant gradient scaler. Also for `bf16 = False`, we
            always require a grad scaler.
        init_state_fn (Callable, optional): function to initialize state in the optimizer.
    """

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        config: OptimizerConfig,
        grad_scaler: MegatronGradScaler,
        init_state_fn: Callable,
    ):
        """Build fp32 main copies of the float16 parameters held by `optimizer`.

        For every float16 parameter that requires grad, an fp32 clone replaces it
        inside the optimizer's param group (and inside any existing optimizer
        state), and the clone is attached to the model parameter as
        `param.main_param`. When `optimizer` is falsy, the instance is marked as a
        stub optimizer and no groups are created.

        Args:
            optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD.
            config (OptimizerConfig): configuration object for optimizer.
            grad_scaler (MegatronGradScaler): used for scaling gradients.
            init_state_fn (Callable): function to initialize state in the optimizer.

        Raises:
            TypeError: if a parameter requiring grad is not a CUDA float, half or
                bfloat16 tensor.
        """

        super().__init__(optimizer, config, grad_scaler, init_state_fn)

        # Handle main parameters.

        if optimizer:
            # Three groups of parameters:
            #   float16_groups: original float16 parameters
            #   fp32_from_float16_groups: fp32 copy of float16 parameters
            #   fp32_from_fp32_groups: original fp32 parameters
            self.float16_groups = []
            self.fp32_from_float16_groups = []
            self.fp32_from_fp32_groups = []

            # For all the groups in the original optimizer:
            for param_group in self.optimizer.param_groups:
                float16_params_this_group = []
                fp32_params_this_group = []
                fp32_from_float16_params_this_group = []
                # For all the parameters in this group:
                for i, param in enumerate(param_group['params']):
                    # Parameters that do not require grad stay in the optimizer's
                    # param group but are not tracked in any of the three lists.
                    if param.requires_grad:

                        # float16 params:
                        if param.type() in ['torch.cuda.HalfTensor', 'torch.cuda.BFloat16Tensor']:
                            float16_params_this_group.append(param)
                            # Create a copy
                            main_param = param.detach().clone().float()
                            # Copy tensor model parallel attributes.
                            tensor_parallel.copy_tensor_model_parallel_attributes(main_param, param)
                            if hasattr(param, 'shared'):
                                main_param.shared = param.shared
                            # Replace the optimizer params with the new fp32 copy.
                            param_group['params'][i] = main_param

                            # Store handle to main_param.
                            param.main_param = main_param

                            fp32_from_float16_params_this_group.append(main_param)
                            # Reset existing state dict key to the new main param.
                            if param in self.optimizer.state:
                                self.optimizer.state[main_param] = self.optimizer.state.pop(param)
                        # fp32 params.
                        elif param.type() == 'torch.cuda.FloatTensor':
                            fp32_params_this_group.append(param)
                            # The optimizer keeps using the original fp32 parameter.
                            param_group['params'][i] = param

                        else:
                            raise TypeError(
                                'Wrapped parameters must be one of '
                                'torch.cuda.FloatTensor,  '
                                'torch.cuda.HalfTensor, or '
                                'torch.cuda.BFloat16Tensor. '
                                'Received {}'.format(param.type())
                            )

                # One (possibly empty) list per optimizer param group, so the three
                # group lists stay index-aligned with `optimizer.param_groups`.
                self.float16_groups.append(float16_params_this_group)
                self.fp32_from_float16_groups.append(fp32_from_float16_params_this_group)
                self.fp32_from_fp32_groups.append(fp32_params_this_group)
            self.is_stub_optimizer = False
        else:
            self.is_stub_optimizer = True

    def zero_grad(self, set_to_none=True):
        """We only need to zero the model related parameters, i.e.,
        float16_groups & fp32_from_fp32_groups. We additionally zero
        fp32_from_float16_groups as a memory optimization to reduce
        fragmentation; in the case of set_to_none==True, the space
        used by this field can be safely deallocated at this point.

        A stub optimizer has no groups and returns immediately."""
        if self.is_stub_optimizer:
            return
        all_groups = chain(
            self.float16_groups, self.fp32_from_float16_groups, self.fp32_from_fp32_groups
        )
        for group in all_groups:
            _zero_grad_group_helper(group, set_to_none)

    def _collect_main_grad_data_for_unscaling(self):
        if self.is_stub_optimizer:
            return

        main_grads = []

        # fp32 params from float16 ones.
        for main_group in self.fp32_from_float16_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)

        # Append fp32 parameters.
        for main_group in self.fp32_from_fp32_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)

        return main_grads

    def _get_model_and_main_params_data_float16(self):
        model_data = []
        main_data = []
        for model_group, main_group in zip(self.float16_groups, self.fp32_from_float16_groups):
            for model_param, main_param in zip(model_group, main_group):
                model_data.append(model_param.data)
                main_data.append(main_param.data)
        return model_data, main_data

    def _copy_model_grads_to_main_grads(self):
        # This only needs to be done for the float16 group.
        for model_group, main_group in zip(self.float16_groups, self.fp32_from_float16_groups):
            for model_param, main_param in zip(model_group, main_group):
                if hasattr(model_param, 'main_grad'):
                    main_param.grad = model_param.main_grad.float()
                else:
                    if model_param.grad is not None:
                        main_param.grad = model_param.grad.float()

                # Safe to deallocate model's grad/main_grad after copying.
                # (If using contiguous buffers, main_grad's memory should
                # persist and therefore should not be deallocated.)
                model_param.grad = None

        # For fp32 grads, we need to reset the grads to main grad.
        for model_group in self.fp32_from_fp32_groups:
            for model_param in model_group:
                model_param.grad = model_param.main_grad

    def _copy_main_params_to_model_params(self):
        """Copy the fp32 main param values into the float16 model params."""
        # Only needed for the float16 params.
        model_tensors, main_tensors = self._get_model_and_main_params_data_float16()
        _multi_tensor_copy_this_to_that(main_tensors, model_tensors, self._dummy_overflow_buf)

    def _copy_model_params_to_main_params(self, state_dict=None):
        """Copy the float16 model param values into the fp32 main params.

        Args:
            state_dict (dict, optional): must be None; initializing the main
                params from a state dict is not supported by this optimizer.
        """
        assert state_dict is None, "Initialize main params from state dict is not supported"
        # Only needed for the float16 params.
        model_tensors, main_tensors = self._get_model_and_main_params_data_float16()
        _multi_tensor_copy_this_to_that(model_tensors, main_tensors, self._dummy_overflow_buf)

    def state_dict(self, is_loading: bool = False):
        """Return the optimizer state dict.

        Args:
            is_loading (bool, optional): when True, `init_state_fn` is called on
                the wrapped optimizer before the state dict is built.
                Defaults to False.

        Returns:
            dict: with keys 'optimizer', 'grad_scaler' (only when a grad scaler
            is set) and 'fp32_from_fp16_params'.
        """
        if is_loading:
            self.init_state_fn(self.optimizer, self.config)

        result = {'optimizer': self.optimizer.state_dict()}
        if self.grad_scaler:
            result['grad_scaler'] = self.grad_scaler.state_dict()
        result['fp32_from_fp16_params'] = self.fp32_from_float16_groups
        return result

    def sharded_state_dict(
        self,
        model_sharded_state_dict: ShardedStateDict,
        is_loading: bool = False,
        metadata: Optional[dict] = None,
    ):
        """Build the sharded state dict of this optimizer from the model's sharded state dict.

        Args:
            model_sharded_state_dict (ShardedStateDict): sharded state dict of the model.
            is_loading (bool, optional): when True, `init_state_fn` is called on the
                wrapped optimizer before the state dict is built. Defaults to False.
            metadata (dict, optional): not used by this implementation.

        Returns:
            The regular `state_dict()` with the fp32 main params and the optimizer
            state converted to sharded tensors. The per-parameter `step` is stored
            once under `['optimizer']['state']['common_step']` when it is truthy.
        """

        if is_loading:
            self.init_state_fn(self.optimizer, self.config)

        # `state_dict()` is called with its default `is_loading=False`, so the
        # state is not initialized a second time.
        state_dict = self.state_dict()

        # Map each optimizer param id to the sharded tensor of its float16 model param.
        id_to_sharded_param_map = get_param_id_to_sharded_param_map(
            model_sharded_state_dict, chain.from_iterable(g for g in self.float16_groups)
        )

        # Convert fp32_from_fp16_params
        assert len(state_dict['fp32_from_fp16_params']) == len(
            state_dict['optimizer']['param_groups']
        )
        state_dict['fp32_from_fp16_params'] = [
            [
                make_sharded_optimizer_tensor(
                    id_to_sharded_param_map[param_id],
                    fp32_param,
                    prefix=f'optimizer.state.fp32_param',
                )
                for param_id, fp32_param in zip(state_group['params'], fp32_group)
            ]
            for fp32_group, state_group in zip(
                state_dict['fp32_from_fp16_params'], state_dict['optimizer']['param_groups']
            )
        ]

        # Raises ValueError if the step is not identical across parameters.
        step = self._extract_common_per_param_step(state_dict['optimizer'])

        # Convert regular optimizer state
        # all optimizer parameters passed to optim_state_to_sharding_state are
        # expected to have the same shape as the model parameters,
        # so we save the step separately and ignore it here
        optim_state_to_sharding_state(
            state_dict['optimizer'], id_to_sharded_param_map, exclude_keys="step"
        )
        # save step as a shared step among all parameters. Separate per-parameter
        # steps are not supported
        # NOTE(review): a falsy step (None or 0) is not stored.
        if step:
            state_dict['optimizer']['state']['common_step'] = step
        return state_dict

    def load_state_dict(self, state_dict):
        """Load the optimizer state, the grad scaler state and the fp32 main params.

        Args:
            state_dict (dict): state dict as produced by `state_dict()` or
                `sharded_state_dict()`. The legacy keys 'optimizer_state_dict' and
                'fp32_from_fp16' are accepted as fallbacks. The dict is modified
                in place ('common_step' is popped, 'param_groups' is replaced).
        """
        # Optimizer.
        optimizer_key = 'optimizer'
        if optimizer_key not in state_dict:
            optimizer_key = 'optimizer_state_dict'
            logger.info('***WARNING*** loading optimizer from an old checkpoint ...')
        # A shared step saved by `sharded_state_dict` is copied back into every
        # per-parameter state.
        if 'common_step' in state_dict[optimizer_key]['state']:
            common_step = state_dict[optimizer_key]['state'].pop('common_step')
            self._restore_common_per_param_step(state_dict[optimizer_key], common_step)

        # Filter and reorder param groups to match current optimizer
        state_dict[optimizer_key]['param_groups'] = self._filter_and_reorder_param_groups(
            self.optimizer.param_groups, state_dict[optimizer_key]['param_groups']
        )
        self.optimizer.load_state_dict(state_dict[optimizer_key])

        # Grad scaler.
        if 'grad_scaler' not in state_dict:
            if self.config.fp16:
                logger.info('***WARNING*** found an old checkpoint, will not load grad scaler ...')
        else:
            if self.grad_scaler:
                self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
            else:
                logger.info(
                    '***WARNING*** fould the grad scaler in the '
                    'checkpoint but it is None in the class. '
                    'Skipping loading grad scaler ...'
                )

        # Copy data for the main params.
        fp32_from_float16_params_key = 'fp32_from_fp16_params'
        if fp32_from_float16_params_key not in state_dict:
            fp32_from_float16_params_key = 'fp32_from_fp16'
        # The saved values are copied in place into the existing main params, so
        # the optimizer keeps referencing the same tensors.
        for current_group, saved_group in zip(
            self.fp32_from_float16_groups, state_dict[fp32_from_float16_params_key]
        ):
            for current_param, saved_param in zip(current_group, saved_group):
                current_param.data.copy_(saved_param.data)


class FP32Optimizer(MegatronOptimizer):
    """Optimizer wrapper for plain float32 training.

    No loss scaling is performed; the reported loss scale is a constant 1.0.

    Args:
        optimizer (torch.optim.Optimizer): base optimizer such as Adam or SGD. Passing
            ``None`` makes this instance a stub optimizer.
        config (OptimizerConfig): configuration object for optimizer.
        init_state_fn (Callable, optional): function to initialize state in the optimizer.
    """

    def __init__(
        self, optimizer: torch.optim.Optimizer, config: OptimizerConfig, init_state_fn: Callable
    ):
        # NOTE: locals() is captured here, so no extra local names are introduced
        # before this call.
        if has_config_logger_enabled(config):
            log_config_to_disk(config, locals(), prefix=type(self).__name__)

        super(FP32Optimizer, self).__init__(optimizer, config, init_state_fn)

        # A stub optimizer wraps no base optimizer at all.
        self.is_stub_optimizer = optimizer is None
        self._scale = torch.tensor([1.0], dtype=torch.float, device='cuda')

    def zero_grad(self, set_to_none=True):
        """Zero the gradients of every param group (no-op for a stub optimizer)."""
        if not self.is_stub_optimizer:
            for param_group in self.optimizer.param_groups:
                _zero_grad_group_helper(param_group['params'], set_to_none)

    def get_loss_scale(self):
        """Return the constant unit scale; the FP32 optimizer never scales the loss."""
        return self._scale

    @torch.no_grad()
    def prepare_grads(self) -> bool:
        """Point ``param.grad`` at ``param.main_grad`` where available.

        Returns:
            bool: always ``False`` -- no inf/nan check is performed here.
        """
        if self.is_stub_optimizer:
            return False

        timers = self.config.timers
        timing_enabled = timers is not None

        # Copy main_grads to grads.
        if timing_enabled:
            timers('optimizer-copy-to-main-grad', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        for group in self.optimizer.param_groups:
            for p in group['params']:
                if hasattr(p, 'main_grad'):
                    p.grad = p.main_grad
        if timing_enabled:
            timers('optimizer-copy-to-main-grad').stop()

        return False

    @torch.no_grad()
    def step_with_ready_grads(self) -> bool:
        """Run the base optimizer step on already-prepared grads; always returns ``True``."""
        if self.is_stub_optimizer:
            return True

        timers = self.config.timers
        timing_enabled = timers is not None

        # Update parameters.
        if timing_enabled:
            timers('optimizer-inner-step', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        self.optimizer.step()
        if timing_enabled:
            timers('optimizer-inner-step').stop()

        return True

    @torch.no_grad()
    def step(self):
        """Prepare grads, optionally clip and count zeros, then step the base optimizer.

        Returns:
            tuple: ``(success, grad_norm, num_zeros_in_grad)``. ``grad_norm`` is ``None``
            unless ``config.clip_grad > 0``; ``num_zeros_in_grad`` is ``None`` unless
            ``config.log_num_zeros_in_grad`` is set.
        """
        timers = self.config.timers
        timing_enabled = timers is not None

        if self.prepare_grads():
            return False, None, None

        # Clip gradients.
        if timing_enabled:
            timers('optimizer-clip-main-grad', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        max_norm = self.config.clip_grad
        grad_norm = self.clip_grad_norm(self.config.clip_grad) if max_norm > 0.0 else None
        if timing_enabled:
            timers('optimizer-clip-main-grad').stop()

        # Count the zeros in the grads.
        if timing_enabled:
            timers('optimizer-count-zeros', log_level=1).start(
                barrier=self.config.barrier_with_L1_time
            )
        if self.config.log_num_zeros_in_grad:
            num_zeros_in_grad = self.count_zeros()
        else:
            num_zeros_in_grad = None
        if timing_enabled:
            timers('optimizer-count-zeros').stop()

        # No overflow for FP32 optimizer.
        return self.step_with_ready_grads(), grad_norm, num_zeros_in_grad

    def reload_model_params(self, state_dict=None):
        """Nothing to reload for the FP32 optimizer."""
        pass

    def state_dict(self):
        """Return the base optimizer's state dict unchanged."""
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        """Load ``state_dict`` into the base optimizer.

        ``state_dict`` is modified in place: a ``'common_step'`` entry is popped from
        ``state_dict['state']`` and ``state_dict['param_groups']`` is replaced.
        """
        optimizer_state = state_dict['state']
        if 'common_step' in optimizer_state:
            shared_step = optimizer_state.pop('common_step')
            self._restore_common_per_param_step(state_dict, shared_step)

        # Filter and reorder param groups to match current optimizer
        state_dict['param_groups'] = self._filter_and_reorder_param_groups(
            self.optimizer.param_groups, state_dict['param_groups']
        )
        self.optimizer.load_state_dict(state_dict)

    def sharded_state_dict(
        self,
        model_sharded_state_dict: ShardedStateDict,
        is_loading: bool = False,
        metadata: Optional[dict] = None,
    ):
        """Build the optimizer state dict in sharded form.

        Args:
            model_sharded_state_dict (ShardedStateDict): sharded state dict of the model,
                used to map optimizer params to sharded params.
            is_loading (bool): when True, ``init_state_fn`` is invoked first.
            metadata (dict, optional): not used by this implementation.
        """
        if is_loading:
            self.init_state_fn(self.optimizer, self.config)

        optim_state = self.state_dict()
        param_map = get_param_id_to_sharded_param_map(
            model_sharded_state_dict, self.get_parameters()
        )
        common_step = self._extract_common_per_param_step(optim_state)

        # optim_state_to_sharding_state expects every optimizer tensor to have the
        # shape of its model parameter, so "step" is excluded and stored separately.
        optim_state_to_sharding_state(optim_state, param_map, exclude_keys="step")
        # A single step shared by all parameters is saved; separate per-parameter
        # steps are not supported.
        if common_step:
            optim_state['state']['common_step'] = common_step
        return optim_state


class ProxyDict:
    """
    Dictionary-like view over a list of dictionaries.

    Keys are ``(index, inner_key)`` tuples, where ``index`` selects one of the
    wrapped dictionaries, e.g. ProxyDict([{'a': 1}, {'b': 2}]) behaves like:
    {
        (0, 'a'): 1,
        (1, 'b'): 2,
    }
    Tuple keys avoid ambiguity with the keys of the inner dicts.
    """

    def __init__(self, inner_dicts: List[dict]):
        self._inner_dicts = inner_dicts

    def __getitem__(self, key: Tuple[int, str]):
        # Lookup goes through .get(), so a missing inner key yields None.
        position, name = key
        mapping = self._inner_dicts[position]
        return mapping.get(name)

    def __setitem__(self, key: Tuple[int, str], value: Any):
        position, name = key
        mapping = self._inner_dicts[position]
        mapping[name] = value

    def __len__(self) -> int:
        # Total number of entries across all wrapped dictionaries.
        return sum(map(len, self._inner_dicts))

    def __iter__(self):
        for position, mapping in enumerate(self._inner_dicts):
            yield from ((position, name) for name in mapping)

    def items(self):
        """Yield ``((index, inner_key), value)`` pairs from every wrapped dictionary."""
        for position, mapping in enumerate(self._inner_dicts):
            yield from (((position, name), value) for name, value in mapping.items())


class ChainedOptimizer(MegatronOptimizer):
    """ChainedOptimizer is designed for a collection of optimizers.

    These optimizers are responsible for different parts of multiple models for
    a training task and will be executed one-by-one when the model is updated.

    Args:
        chained_optimizers: a list of optimizers.
    """

    def __init__(self, chained_optimizers: List[MegatronOptimizer]):
        """Collect the model chunks and the shared config of the wrapped optimizers.

        Args:
            chained_optimizers: optimizers to chain; may be empty. Every optimizer
                must carry a ``config`` equal to the first optimizer's.
        """
        self.model_chunks = []
        # chained_optimizers would be empty in the case that a rank
        # has no trainable parameters
        if chained_optimizers:
            self.config = getattr(chained_optimizers[0], 'config', None)
            for optimizer in chained_optimizers:
                if hasattr(optimizer, 'model_chunks'):
                    # Keep each model chunk once, in first-seen order.
                    for model_chunk in optimizer.model_chunks:
                        if model_chunk not in self.model_chunks:
                            self.model_chunks.append(model_chunk)
                assert self.config == getattr(optimizer, 'config', None)
            # If all optimizers are stub optimizers, the ChainedOptimizer is also a stub optimizer
            self.is_stub_optimizer = all(
                getattr(optimizer, 'is_stub_optimizer', False) for optimizer in chained_optimizers
            )

        else:
            # NOTE: self.config is not set on this path.
            self.is_stub_optimizer = True
        self.chained_optimizers = chained_optimizers

    @property
    def optimizer(self):
        """
        Access underlying optimizer when only one optimizer included for backward compatibility.
        """
        assert (
            len(self.chained_optimizers) == 1
        ), "ChainedOptimizer has more than one optimizer when accessing self.optimizer"
        return self.chained_optimizers[0].optimizer

    @property
    def param_groups(self) -> List[dict]:
        """Get param_groups aggregated over underlying optimizers."""
        param_groups = []
        for optimizer in self.chained_optimizers:
            param_groups += optimizer.param_groups
        return param_groups

    @property
    def state(self) -> ProxyDict:
        """
        Return optimizer state with tuple keys, where the first element is the
        index of the optimizer in the list of chained optimizers.
        """
        return ProxyDict([opt.state for opt in self.chained_optimizers])

    def zero_grad(self, set_to_none=True):
        """Zero the gradients of every chained optimizer."""
        for optimizer in self.chained_optimizers:
            optimizer.zero_grad(set_to_none)

    def get_loss_scale(self):
        """Return the first optimizer's loss scale, or a unit scale when the chain is empty."""
        if self.chained_optimizers:
            return self.chained_optimizers[0].get_loss_scale()
        else:
            return torch.tensor([1.0], dtype=torch.float32, device=torch.cuda.current_device())

    def _split_state_dict(self, state_dict):
        """Split the state dict into sub-state dicts according to the chunks of each sub-optimizer
        in this chained optimizer.

        For example, assume there are two sub-optimizers in total: the first has 1 model chunk, and
        the second has 7 model chunks. The state dict contains model0 ~ model7. This function splits
        the state dict into two sub-state dicts: the first contains model0, and the second contains
        model1 ~ model7 (but renamed as model0 ~ model6).

        Returns:
            list: one entry per chained optimizer; entries stay ``None`` for optimizers
            that receive no sub-state dict (including when ``state_dict`` is ``None``).
        """
        state_dicts = [None] * len(self.chained_optimizers)
        if state_dict is not None:
            if len(self.model_chunks) == 1:
                state_dicts[0] = state_dict
            else:
                # Split state_dict if needed
                # Two key spellings are accepted: "model<N>" and "model_<N>".
                prefix = "model" if "model0" in state_dict.keys() else "model_"
                # Running index into the global model<N> keys.
                offset = 0
                for optimizer_idx, optimizer in enumerate(self.chained_optimizers):
                    if hasattr(optimizer, "model_chunks"):
                        d = {}
                        for chunk_idx in range(len(optimizer.model_chunks)):
                            assert (
                                f"{prefix}{offset}" in state_dict
                            ), f"Wrong state_dict format, cannot find '{prefix}{offset}'"
                            d[f"{prefix}{chunk_idx}"] = state_dict[f"{prefix}{offset}"]
                            offset += 1
                        if len(d) > 0:
                            state_dicts[optimizer_idx] = d
        return state_dicts

    def reload_model_params(self, state_dict=None):
        """Reload model params in every chained optimizer, each with its share of state_dict."""
        state_dicts = self._split_state_dict(state_dict)
        for idx, optimizer in enumerate(self.chained_optimizers):
            optimizer.reload_model_params(state_dict=state_dicts[idx])

    def state_dict(self):
        """Return the single optimizer's state dict, or a list of state dicts otherwise."""
        if len(self.chained_optimizers) == 1:
            return self.chained_optimizers[0].state_dict()
        else:
            return [optimizer.state_dict() for optimizer in self.chained_optimizers]

    def sharded_state_dict(
        self, model_sharded_state_dict: ShardedStateDict, is_loading: bool = False, **kwargs
    ):
        """Build the sharded state dict of the chained optimizers.

        With a single optimizer its sharded state dict is returned directly. Otherwise
        the result is a dict keyed by optimizer index, and steps are synchronized
        across optimizers first.

        Args:
            model_sharded_state_dict (ShardedStateDict): sharded state dict of the model.
            is_loading (bool): forwarded to each optimizer.
            **kwargs: forwarded to each optimizer; ``metadata`` is also read here to
                decide whether tensor state keys get a ``chained_<idx>.`` prefix.
        """
        metadata = kwargs.get('metadata') or {}
        # ChainedOptimizer should add its prefix to the tensor state keys only if
        # DistributedOptimizer is used (non-empty 'distrib_optim_sharding_type') and uses
        # a non fully-reshardable format. For backward compatibility we also add it
        # if `chained_optim_avoid_prefix` is False.
        from .distrib_optimizer import DistributedOptimizer

        should_add_prefix = (
            "distrib_optim_sharding_type" in metadata
            and metadata["distrib_optim_sharding_type"]
            not in DistributedOptimizer.checkpoint_fully_reshardable_formats
        ) or not metadata.get('chained_optim_avoid_prefix', False)

        if len(self.chained_optimizers) == 1:
            return self.chained_optimizers[0].sharded_state_dict(
                model_sharded_state_dict, is_loading, **kwargs
            )
        else:
            self._synchronize_steps()
            sharded_state_dict = {}
            for optimizer_idx, optimizer in enumerate(self.chained_optimizers):
                optim_state_dict = optimizer.sharded_state_dict(
                    model_sharded_state_dict, is_loading, **kwargs
                )
                if should_add_prefix:
                    add_prefix_for_sharding(optim_state_dict, f'chained_{optimizer_idx}.')
                sharded_state_dict[optimizer_idx] = optim_state_dict
            return sharded_state_dict

    def load_state_dict(self, state_dict):
        """Load state into the chained optimizers.

        Args:
            state_dict: the single optimizer's state dict when exactly one optimizer is
                chained; otherwise a sequence, or a dict whose values are taken in
                sorted-key order, with one entry per chained optimizer.

        Raises:
            RuntimeError: if the number of entries does not match the number of
                chained optimizers.
        """
        # If there is only one optimizer, we read the state dict as a single optimizer.
        if len(self.chained_optimizers) == 1:
            self.chained_optimizers[0].load_state_dict(state_dict)
            return
        if len(self.chained_optimizers) != len(state_dict):
            raise RuntimeError(
                f'Expected {len(self.chained_optimizers)} entries'
                f' in state dict, but got {len(state_dict)}.'
            )
        if isinstance(state_dict, dict):
            state_dict = (v for k, v in sorted(state_dict.items()))
        for optimizer, state in zip(self.chained_optimizers, state_dict):
            optimizer.load_state_dict(state)
        self._synchronize_steps()

    @torch.no_grad()
    def prepare_grads(self) -> bool:
        """Pre-processing gradients before the optimizer step, returns whether inf/nan is found."""
        found_inf_flag = False
        # Every optimizer is always prepared; the flags are OR-ed together.
        for optimizer in self.chained_optimizers:
            found_inf_flag |= optimizer.prepare_grads()

        return found_inf_flag

    @torch.no_grad()
    def step_with_ready_grads(self) -> bool:
        """Step the optimizer with ready gradients, return successful."""
        success = True
        for optimizer_idx, optimizer in enumerate(self.chained_optimizers):
            success &= optimizer.step_with_ready_grads()
            # With overlap enabled, param sync of the first optimizer's single model
            # chunk is started right after its step.
            if self.config.overlap_param_gather_with_optimizer_step and optimizer_idx == 0:
                assert success
                assert len(optimizer.model_chunks) == 1
                optimizer.model_chunks[0].start_param_sync(force_dispatch=True)

        return success

    def grads_states_parallel_group_is_shared(self):
        """Check if all optimizers share the same gradient statistics parallel group."""
        reference_group = self.chained_optimizers[0].get_grad_stats_parallel_group()
        return all(
            optimizer.get_grad_stats_parallel_group() == reference_group
            for optimizer in self.chained_optimizers
        )

    def get_grad_stats_parallel_group(self) -> torch.distributed.ProcessGroup:
        """Return the grad-stats parallel group, which must be shared by all optimizers."""
        assert self.grads_states_parallel_group_is_shared(), (
            "Can't use get_grad_stats_parallel_group() for ChainedOptimizer, "
            "since grads states parallel group are not shared across all optimizers"
        )
        return self.chained_optimizers[0].get_grad_stats_parallel_group()

    @torch.no_grad()
    def get_grad_norm(self):
        """Compute the gradient norm over all chained optimizers.

        When the grad-stats parallel group is shared, the norm is computed once over
        the concatenated grads. Otherwise per-optimizer norms are combined as the
        square root of the sum of squares, treating a falsy norm as 0.0.
        """
        if len(self.chained_optimizers) == 1:
            return self.chained_optimizers[0].get_grad_norm()
        if self.grads_states_parallel_group_is_shared():
            grads_for_norm = []
            for optimizer in self.chained_optimizers:
                grads_for_norm += optimizer.get_main_grads_for_grad_norm()
            grad_norm = get_grad_norm_fp32(
                grads_for_norm, grad_stats_parallel_group=self.get_grad_stats_parallel_group()
            )
        else:
            grad_norms = []
            for optimizer in self.chained_optimizers:
                _grad_norm = optimizer.get_grad_norm()
                grad_norms += [_grad_norm if _grad_norm else 0.0]
            grad_norm = math.sqrt(sum([x**2 for x in grad_norms]))
        return grad_norm

    @torch.no_grad()
    def count_zeros(self):
        """Count zeros in the grads of all chained optimizers.

        With a shared grad-stats parallel group the count is done once over all
        parameters; otherwise per-optimizer counts are summed, skipping optimizers
        whose config does not enable ``log_num_zeros_in_grad``.
        """
        if self.grads_states_parallel_group_is_shared():
            params = []
            for optimizer in self.chained_optimizers:
                params += optimizer.get_parameters()
            return count_zeros_fp32(
                params,
                grad_stats_parallel_group=self.get_grad_stats_parallel_group(),
                use_decoupled_grad=self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8,
            )
        else:
            num_zeros_in_grad = 0
            for optimizer in self.chained_optimizers:
                num_zeros_in_grad += (
                    optimizer.count_zeros() if optimizer.config.log_num_zeros_in_grad else 0
                )
            return num_zeros_in_grad

    @torch.no_grad()
    def step(self):
        """ChainedOptimizer will step all optimizers one by one.

        Returns:
            tuple: ``(update_successful, grad_norm, num_zeros_in_grad)``, or
            ``(False, None, None)`` when ``prepare_grads`` reports inf/nan.
        """
        found_inf_flag = self.prepare_grads()
        if found_inf_flag:
            return False, None, None

        grad_norm = self.get_grad_norm()

        # Clip gradients.
        # Each optimizer is clipped against the norm computed over the whole chain.
        for optimizer in self.chained_optimizers:
            if hasattr(optimizer, 'is_stub_optimizer') and optimizer.is_stub_optimizer:
                continue
            parameters = optimizer.get_parameters()
            if len(parameters) == 0:
                continue
            if optimizer.config.clip_grad > 0.0:
                clip_grad_by_total_norm_fp32(
                    parameters,
                    max_norm=optimizer.config.clip_grad,
                    total_norm=grad_norm,
                    use_decoupled_grad=(
                        optimizer.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8
                    ),
                )

        # Count the zeros in the grads.
        num_zeros_in_grad = self.count_zeros() if self.config.log_num_zeros_in_grad else None

        update_successful = self.step_with_ready_grads()

        return update_successful, grad_norm, num_zeros_in_grad

    def save_parameter_state(self, filename: str):
        """Save the distributed parameter states of all optimizers to a file.

        The saved object is a list with one entry per optimizer that exposes
        ``get_parameter_state_dp_zero`` (optimizers without it contribute no entry).
        The file is written only if at least one such optimizer is on DP rank 0.

        Args:
            filename (str): path to save parameter state to.
        """
        if len(self.chained_optimizers) == 1:
            self.chained_optimizers[0].save_parameter_state(filename)
            return
        save_states = False
        states = []
        for optimizer in self.chained_optimizers:
            if hasattr(optimizer, 'get_parameter_state_dp_zero'):
                state_dict = optimizer.get_parameter_state_dp_zero()

                # Save checkpoint economically, only when DP rank = 0, state dict
                # needs to be saved.
                if optimizer.data_parallel_group.rank() == 0:
                    states.append(state_dict)
                    save_states = True
                else:
                    assert state_dict is None
                    states.append(None)

        if save_states:
            torch.save(states, filename)

    def load_parameter_state(self, filename: str, *, update_legacy_format: bool = False):
        """Load the distributed parameter states of all optimizers from a file.

        Args:
            filename (str): path to load parameter state from.
            update_legacy_format (bool): forwarded unchanged to each optimizer's loader.
        """
        if len(self.chained_optimizers) == 1:
            self.chained_optimizers[0].load_parameter_state(
                filename, update_legacy_format=update_legacy_format
            )
            return
        states = None
        # Index into the saved list. save_parameter_state() appends an entry only for
        # optimizers that take part in DP-zero state handling, so the saved list must
        # be indexed by the count of such optimizers rather than by position in
        # self.chained_optimizers (the two differ when the chain mixes optimizer kinds).
        # NOTE(review): assumes optimizers exposing load_parameter_state_from_dp_zero are
        # the same ones exposing get_parameter_state_dp_zero -- confirm.
        state_idx = 0
        for optimizer in self.chained_optimizers:
            if not hasattr(optimizer, 'load_parameter_state_from_dp_zero'):
                continue

            # Lazy loading checkpoint, state dict is needed only when DP rank = 0.
            # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
            if optimizer.data_parallel_group.rank() == 0 and states is None:
                states = torch.load(filename)

            state_dict = states[state_idx] if states else None
            state_idx += 1
            optimizer.load_parameter_state_from_dp_zero(
                state_dict, update_legacy_format=update_legacy_format
            )

    def _synchronize_steps(self):
        """
        Synchronize the step of all optimizers.
        TE FusedAdam will not accumulate "step" for empty param groups,
        so we need to align the step across param groups before saving and after loading.

        Returns:
            The common step found in non-empty param groups carrying a ``'step'``
            entry, or ``None`` when there is none. More than one distinct step value
            trips an assertion.
        """

        steps = []
        for optimizer in self.chained_optimizers:
            for param_group in optimizer.optimizer.param_groups:
                if len(param_group['params']) > 0 and 'step' in param_group:
                    steps.append(param_group['step'])
        steps = list(set(steps))
        assert len(steps) <= 1, f"steps: {steps}"
        step = steps[0] if len(steps) == 1 else None
        for optimizer in self.chained_optimizers:
            for param_group in optimizer.optimizer.param_groups:
                if len(param_group['params']) > 0 and 'step' in param_group:
                    param_group['step'] = step

        return step

    def offload_to_cpu(self):
        """Move optimizer state to CPU to free GPU memory during inference."""
        for optimizer in self.chained_optimizers:
            optimizer.offload_to_cpu()

    def restore_from_cpu(self):
        """Restore optimizer state from CPU back to GPU for training."""
        for optimizer in self.chained_optimizers:
            optimizer.restore_from_cpu()
