# mypy: allow-untyped-defs
r"""Implementation for Stochastic Gradient Descent optimizer."""
from typing import cast, List, Optional, Union, Tuple

import torch
from torch import Tensor

from torch.optim.optimizer import (
    _default_to_fused_or_foreach,
    _use_grad_for_differentiable,
    Optimizer,
)

# We assume torch.optim.adamw is available
from torch.optim.adamw import (
    AdamW,
    adamw
)

# SgdWithSign and its functional backend sgd_advanced are assumed to be
# defined elsewhere if needed; they are not direct dependencies of the
# SoftSignWithAdam implementation in this file.

def _single_tensor_softsignum_sgd(
    params: List[Tensor],
    grads: List[Tensor],
    momentum_buffer_list: List[Optional[Tensor]],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    weight_decay: float,
    momentum: float,
    lr: float,
    dampening: float,
    nesterov: bool,
    maximize: bool,
    current_iter: int,
    warmup_iters: int,
    tmin: float,
    tmax: float,
    has_sparse_grad: bool,
):
    """
    Per-tensor SoftSignum SGD update (internal helper).

    Every parameter is updated in place, one tensor at a time:

    1. optional decoupled weight decay: ``param *= 1 - lr * weight_decay``;
    2. optional momentum (with dampening and Nesterov variants), whose buffers
       are created on first use and written back into ``momentum_buffer_list``;
    3. the resulting direction is squashed with ``tanh(d * temperature / 2)``
       and subtracted from the parameter, scaled by ``lr``.

    The temperature is a linear interpolation from ``tmax`` (at
    ``current_iter == 0``) to ``tmin`` (once ``current_iter >= warmup_iters``);
    with ``warmup_iters <= 0`` it is ``tmin`` from the start.

    ``grad_scale`` and ``found_inf`` must both be ``None``.
    ``has_sparse_grad`` is accepted for signature compatibility and is not read.
    """
    assert grad_scale is None and found_inf is None

    # Fraction of the annealing schedule completed so far, clamped to [.., 1].
    if warmup_iters > 0:
        anneal_fraction = min(current_iter, warmup_iters) / warmup_iters
    else:
        anneal_fraction = 1.0
    temperature = tmax + (tmin - tmax) * anneal_fraction

    use_momentum = momentum != 0
    apply_decay = weight_decay != 0
    if apply_decay:
        # Decoupled decay factor is identical for every tensor in the list.
        decay_factor = 1 - lr * weight_decay

    for idx, weight in enumerate(params):
        direction = -grads[idx] if maximize else grads[idx]

        if apply_decay:
            weight.mul_(decay_factor)

        if use_momentum:
            velocity = momentum_buffer_list[idx]
            if velocity is not None:
                velocity.mul_(momentum).add_(direction, alpha=1 - dampening)
            else:
                # First step for this tensor: seed the buffer with the gradient.
                velocity = torch.clone(direction).detach()
                momentum_buffer_list[idx] = velocity

            direction = direction.add(velocity, alpha=momentum) if nesterov else velocity

        # Smooth sign surrogate of the (momentum-accumulated) direction.
        squashed = torch.tanh(direction * temperature / 2.0)
        weight.add_(squashed, alpha=-lr)


def softsignum_sgd(
    params: List[Tensor],
    d_p_list: List[Tensor],
    momentum_buffer_list: List[Optional[Tensor]],
    has_sparse_grad: bool = False,
    foreach: Optional[bool] = None,
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    *,
    weight_decay: float,
    momentum: float,
    lr: float,
    dampening: float,
    nesterov: bool,
    maximize: bool,
    current_iter: int,
    warmup_iters: int,
    tmin: float,
    tmax: float,
):
    r"""Functional entry point for the SoftSignum SGD update.

    Only the single-tensor implementation exists. ``foreach=None`` is treated
    as ``False``; a truthy ``foreach`` or ``fused`` raises ``RuntimeError``
    under ``torch.jit`` scripting and ``NotImplementedError`` otherwise.
    All remaining arguments are forwarded unchanged to
    ``_single_tensor_softsignum_sgd``, which updates ``params`` in place.
    """
    use_foreach = False if foreach is None else foreach

    # Scripting-specific errors take precedence over the generic ones below.
    if use_foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
    if fused and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with fused optimizers")

    if use_foreach:
        raise NotImplementedError("`foreach` option is not implemented for SoftSignum SGD")
    if fused:
        raise NotImplementedError("`fused` option is not implemented for SoftSignum SGD")

    _single_tensor_softsignum_sgd(
        params,
        d_p_list,
        momentum_buffer_list,
        grad_scale=grad_scale,
        found_inf=found_inf,
        has_sparse_grad=has_sparse_grad,
        lr=lr,
        momentum=momentum,
        dampening=dampening,
        nesterov=nesterov,
        weight_decay=weight_decay,
        maximize=maximize,
        current_iter=current_iter,
        warmup_iters=warmup_iters,
        tmin=tmin,
        tmax=tmax,
    )


class SoftSignumSGD(Optimizer):
    """SGD-style optimizer that steps along ``tanh`` of the scaled gradient.

    The update itself is performed by :func:`softsignum_sgd`. A single
    iteration counter, shared by all parameter groups, is stored under
    ``self.state['step']`` and is passed to the functional as ``current_iter``
    together with each group's ``warmup_iters``, ``tmin`` and ``tmax``.
    """

    def __init__(
        self,
        params,
        lr: Union[float, Tensor] = 1e-3,
        momentum: float = 0,
        dampening: float = 0,
        weight_decay: float = 0,
        nesterov: bool = False,
        warmup_iters: int = 1000,
        tmin: float = 2.0,
        tmax: float = 20.0,
        *,
        maximize: bool = False,
        foreach: Optional[bool] = None,
        differentiable: bool = False,
        fused: Optional[bool] = None,
    ):
        """Validate hyper-parameters and register them as group defaults.

        Raises:
            ValueError: for a multi-element tensor ``lr``, a negative ``lr``,
                ``momentum``, ``weight_decay`` or ``warmup_iters``, or when
                ``nesterov`` is set without positive momentum and zero dampening.
        """
        lr_is_tensor = isinstance(lr, Tensor)
        if lr_is_tensor and lr.numel() != 1:
            raise ValueError("Tensor lr must be 1-element")
        if lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        if momentum < 0.0:
            raise ValueError(f"Invalid momentum value: {momentum}")
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if warmup_iters < 0:
            raise ValueError(f"Invalid warmup_iters value: {warmup_iters}")
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")

        hyperparams = {
            "lr": lr,
            "momentum": momentum,
            "dampening": dampening,
            "weight_decay": weight_decay,
            "nesterov": nesterov,
            "warmup_iters": warmup_iters,
            "tmin": tmin,
            "tmax": tmax,
            "maximize": maximize,
            "foreach": foreach,
            "differentiable": differentiable,
            "fused": fused,
        }

        super().__init__(params, hyperparams)
        # Global iteration counter driving the temperature schedule.
        self.state.setdefault('step', 0)

    def __setstate__(self, state):
        """Restore state, filling in keys that older checkpoints may lack."""
        super().__setstate__(state)
        self.state.setdefault('step', 0)

        fallback_options = (
            ("nesterov", False),
            ("maximize", False),
            ("foreach", None),
            ("differentiable", False),
            ("fused", False),
        )
        for group in self.param_groups:
            for option, fallback in fallback_options:
                group.setdefault(option, fallback)

    def _init_group(self, group, params, grads, momentum_buffer_list):
        """Append every parameter of ``group`` that has a gradient to the output lists.

        For each such parameter its gradient and its stored momentum buffer
        (``None`` when none exists yet) are appended in lockstep.

        Returns:
            ``True`` if any collected gradient is sparse, otherwise ``False``.
        """
        saw_sparse = False
        for tensor in group['params']:
            gradient = tensor.grad
            if gradient is None:
                continue

            params.append(tensor)
            grads.append(gradient)
            saw_sparse = saw_sparse or gradient.is_sparse

            momentum_buffer_list.append(self.state[tensor].get('momentum_buffer'))

        return saw_sparse

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform one optimization step over all parameter groups.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is invoked with gradients enabled.

        Returns:
            The closure's return value, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # The counter advances once per call, not once per group.
        self.state['step'] += 1
        iteration = self.state['step']

        scaler_grad_scale = getattr(self, "grad_scale", None)
        scaler_found_inf = getattr(self, "found_inf", None)

        for group in self.param_groups:
            active_params: List[Tensor] = []
            active_grads: List[Tensor] = []
            buffers: List[Optional[Tensor]] = []

            sparse = self._init_group(group, active_params, active_grads, buffers)

            softsignum_sgd(
                active_params,
                active_grads,
                buffers,
                has_sparse_grad=sparse,
                foreach=group["foreach"],
                fused=group["fused"],
                grad_scale=scaler_grad_scale,
                found_inf=scaler_found_inf,
                weight_decay=group["weight_decay"],
                momentum=group["momentum"],
                lr=group["lr"],
                dampening=group["dampening"],
                nesterov=group["nesterov"],
                maximize=group["maximize"],
                current_iter=iteration,
                warmup_iters=group["warmup_iters"],
                tmin=group["tmin"],
                tmax=group["tmax"],
            )

            # Persist buffers the functional may have created on this step.
            if group["momentum"] != 0:
                for tensor, buffer in zip(active_params, buffers):
                    self.state[tensor]["momentum_buffer"] = buffer
        return loss


class SoftSignWithAdam(Optimizer):
    """Hybrid optimizer: SoftSignum SGD for flagged groups, AdamW for the rest.

    A parameter group whose dict has a truthy ``is_sgd_params`` or
    ``is_proj_params`` entry is updated with :func:`softsignum_sgd`; every
    other group is updated with the functional ``adamw``. A single iteration
    counter is stored under ``self.state['step']`` and drives the SoftSignum
    temperature schedule.

    After construction ``param_groups`` lists all SoftSignum groups first,
    followed by all AdamW groups.
    """

    # Parameter names accepted by ``AdamW._init_group`` (without ``self``).
    # Resolved lazily on first use; see ``_init_group_adam``.
    _adam_init_group_arg_names: Optional[Tuple[str, ...]] = None

    def __init__(
        self,
        params,
        lr: Union[float, Tensor] = 1e-3,
        momentum: float = 0,
        dampening: float = 0,
        weight_decay: float = 0,
        nesterov: bool = False,
        warmup_iters: int = 1000,
        tmin: float = 2.0,
        tmax: float = 20.0,
        adam_lr: Union[float, Tensor] = 1e-3,
        adam_betas: Tuple[float, float] = (0.9, 0.999),
        adam_eps: float = 1e-8,
        adam_amsgrad: bool = False,
        *,
        maximize: bool = False,
        foreach: Optional[bool] = None,
        fused: Optional[bool] = None,
        differentiable: bool = False,
    ):
        """
        Implements an optimizer that uses SoftSignum SGD for specified parameter groups
        and AdamW for all other groups.

        Args:
            params (iterable): iterable of parameters to optimize or dicts defining
                parameter groups. Groups with `is_sgd_params: True` or `is_proj_params: True`
                will use SoftSignum SGD. Plain parameters (no dicts) form a single
                AdamW group. Any iterable is accepted, including one-shot generators.
            lr (float, Tensor): learning rate for SoftSignum SGD (default: 1e-3)
            momentum (float): momentum factor for SoftSignum SGD (default: 0)
            dampening (float): dampening for SoftSignum SGD momentum (default: 0)
            weight_decay (float): weight decay, passed to both SoftSignum SGD and AdamW.
            nesterov (bool): enables Nesterov momentum for SoftSignum SGD.
            warmup_iters (int): number of iterations for annealing temperature in SoftSignum.
            tmin (float): minimum temperature for SoftSignum.
            tmax (float): maximum temperature for SoftSignum.
            adam_lr (float, Tensor): learning rate for AdamW (default: 1e-3)
            adam_betas (Tuple[float, float]): AdamW ``betas``.
            adam_eps (float): AdamW ``eps``.
            adam_amsgrad (bool): AdamW ``amsgrad``.
            maximize (bool): passed to both internal optimizers.
            foreach, fused, differentiable: passed to AdamW only; the SoftSignum
                part always runs with ``foreach=False``, ``fused=False``,
                ``differentiable=False``.
        """
        # Materialize once: ``params`` may be a one-shot iterator, and it is
        # needed by the base class and by both partitions below.
        groups = list(params)
        if groups and not isinstance(groups[0], dict):
            # Plain tensors carry no routing flag, so they become one AdamW group.
            groups = [{"params": groups}]

        defaults = dict()  # Not used directly, but required by Optimizer.__init__
        # The base class validates the groups (and rejects an empty list).
        super().__init__(groups, defaults)

        # This state must be at the top level for the annealing schedule
        self.state.setdefault('step', 0)

        # Separate parameter groups for SoftSignum SGD and AdamW
        sgd_groups = [group for group in groups if self._is_sgd_group(group)]
        adam_groups = [group for group in groups if not self._is_sgd_group(group)]

        # Throwaway internal optimizers are used only to validate hyper-parameters
        # and fill per-group defaults. Each is built only when it has groups,
        # because an optimizer cannot be constructed from an empty list.
        sgd_param_groups: List[dict] = []
        if sgd_groups:
            sgd_param_groups = SoftSignumSGD(
                sgd_groups,
                lr=lr,
                momentum=momentum,
                dampening=dampening,
                weight_decay=weight_decay,
                nesterov=nesterov,
                warmup_iters=warmup_iters,
                tmin=tmin,
                tmax=tmax,
                maximize=maximize,
                foreach=False,  # Keep it simple, no foreach/fused for the SGD part
                fused=False,
                differentiable=False,
            ).param_groups

        adam_param_groups: List[dict] = []
        if adam_groups:
            adam_param_groups = torch.optim.AdamW(
                adam_groups,
                lr=adam_lr,
                betas=adam_betas,
                eps=adam_eps,
                weight_decay=weight_decay,
                amsgrad=adam_amsgrad,
                maximize=maximize,
                foreach=foreach,
                fused=fused,
                differentiable=differentiable,
            ).param_groups

        # Combine parameter groups from both optimizers (SGD groups first)
        self.param_groups = sgd_param_groups + adam_param_groups

    def __setstate__(self, state):
        """Restore optimizer state and guarantee the global ``'step'`` counter exists."""
        # A 'step' entry already present in ``state['state']`` is kept as is:
        # the base class installs that mapping as ``self.state``.
        super().__setstate__(state)
        self.state.setdefault('step', 0)

    def _is_sgd_group(self, group):
        """Return a truthy value when ``group`` is flagged for SoftSignum SGD."""
        # A group is designated for SGD-like updates if it has these flags
        return group.get("is_proj_params", False) or group.get("is_sgd_params", False)

    def _init_group_sgd(self, group, params, grads, momentum_buffer_list):
        """Collect params, grads and momentum buffers of an SGD group.

        Reuses ``SoftSignumSGD._init_group`` against this optimizer's state and
        returns its result (whether any gradient is sparse).
        """
        return SoftSignumSGD._init_group(
            self, group, params, grads, momentum_buffer_list
        )

    def _init_group_adam(
        self,
        group,
        params_with_grad,
        grads,
        exp_avgs,
        exp_avg_sqs,
        max_exp_avg_sqs,
        state_steps,
        *,
        amsgrad=None,
        beta1=None,
        beta2=None,
    ):
        """Populate the AdamW work lists for ``group`` via ``AdamW._init_group``.

        ``amsgrad``, ``beta1`` and ``beta2`` default to the values stored in
        ``group``. Arguments are bound by name, and only the names that
        ``AdamW._init_group`` actually declares are passed.

        NOTE(review): ``_init_group`` is a private PyTorch method whose
        parameter list may differ between releases; name-based binding is
        meant to tolerate that. Confirm against the PyTorch version in use.
        """
        import inspect

        cls = type(self)
        accepted = cls._adam_init_group_arg_names
        if accepted is None:
            accepted = tuple(
                name
                for name in inspect.signature(AdamW._init_group).parameters
                if name != "self"
            )
            cls._adam_init_group_arg_names = accepted

        group_beta1, group_beta2 = group["betas"]
        candidates = {
            "group": group,
            "params_with_grad": params_with_grad,
            "grads": grads,
            "exp_avgs": exp_avgs,
            "exp_avg_sqs": exp_avg_sqs,
            "max_exp_avg_sqs": max_exp_avg_sqs,
            "state_steps": state_steps,
            "amsgrad": group["amsgrad"] if amsgrad is None else amsgrad,
            "beta1": group_beta1 if beta1 is None else beta1,
            "beta2": group_beta2 if beta2 is None else beta2,
        }
        call_kwargs = {name: candidates[name] for name in accepted if name in candidates}
        return AdamW._init_group(self, **call_kwargs)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss; it is invoked with gradients enabled.

        Returns:
            The closure's return value, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # Increment the global step counter *before* the loop
        self.state['step'] += 1
        current_iter = self.state['step']

        grad_scale = getattr(self, "grad_scale", None)
        found_inf = getattr(self, "found_inf", None)

        for group in self.param_groups:
            if self._is_sgd_group(group):
                # --- SoftSignum SGD Logic ---
                params: List[Tensor] = []
                grads: List[Tensor] = []
                momentum_buffer_list: List[Optional[Tensor]] = []

                has_sparse_grad = self._init_group_sgd(
                    group, params, grads, momentum_buffer_list
                )

                softsignum_sgd(
                    params,
                    grads,
                    momentum_buffer_list,
                    weight_decay=group["weight_decay"],
                    momentum=group["momentum"],
                    lr=group["lr"],
                    dampening=group["dampening"],
                    nesterov=group["nesterov"],
                    maximize=group["maximize"],
                    # Pass SoftSignum specific parameters
                    current_iter=current_iter,
                    warmup_iters=group["warmup_iters"],
                    tmin=group["tmin"],
                    tmax=group["tmax"],
                    has_sparse_grad=has_sparse_grad,
                    foreach=group["foreach"],
                    fused=group["fused"],
                    grad_scale=grad_scale,
                    found_inf=found_inf,
                )

                # Update momentum_buffers in the optimizer's state
                if group["momentum"] != 0:
                    for p, momentum_buffer in zip(params, momentum_buffer_list):
                        self.state[p]["momentum_buffer"] = momentum_buffer

            else:
                # --- AdamW Logic ---
                params_with_grad: List[Tensor] = []
                adam_grads: List[Tensor] = []
                exp_avgs: List[Tensor] = []
                exp_avg_sqs: List[Tensor] = []
                max_exp_avg_sqs: List[Tensor] = []
                state_steps: List[Tensor] = []
                amsgrad = group["amsgrad"]
                beta1, beta2 = group["betas"]

                self._init_group_adam(
                    group,
                    params_with_grad,
                    adam_grads,
                    exp_avgs,
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
                    amsgrad=amsgrad,
                    beta1=beta1,
                    beta2=beta2,
                )

                adamw(
                    params_with_grad,
                    adam_grads,
                    exp_avgs,
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
                    amsgrad=amsgrad,
                    beta1=beta1,
                    beta2=beta2,
                    lr=group["lr"],
                    weight_decay=group["weight_decay"],
                    eps=group["eps"],
                    maximize=group["maximize"],
                    foreach=group["foreach"],
                    capturable=group.get("capturable", False),
                    differentiable=group["differentiable"],
                    fused=group["fused"],
                    grad_scale=grad_scale,
                    found_inf=found_inf,
                )

        return loss
