# mypy: allow-untyped-defs
r"""Implementation for an advanced SGD-like optimizer with SoftSignum and AdamW."""
from typing import cast, List, Optional, Union, Tuple

import torch
from torch import Tensor

from torch.optim.optimizer import (
    _default_to_fused_or_foreach,
    _use_grad_for_differentiable,
    Optimizer,
)

from torch.optim.adamw import (
    AdamW,
    adamw
)


def _single_tensor_softsignum_sgd(
    params: List[Tensor],
    grads: List[Tensor],
    momentum_buffer_list: List[Optional[Tensor]],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    weight_decay: float,
    momentum: float,
    lr: Union[float, Tensor],
    dampening: float,
    nesterov: bool,
    maximize: bool,
    current_iter: int,
    warmup_iters: int,
    sgd_last: bool,
    decoupled_wd: bool,
    wd_warmup: bool,
    only_sign_iters: int,
    tmin: float,
    tmax: float,
    normalized: bool,
    sign_norm: bool,
    has_sparse_grad: bool,
    hook: Optional[callable] = None,
    auto_temperature: bool = False,
    const_temperature: bool = False,
):
    """
    Contains the core logic for a single tensor SoftSignum SGD update with normalization.
    """
    assert grad_scale is None and found_inf is None
    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]

        if weight_decay != 0:
            if decoupled_wd:
                if wd_warmup and current_iter - 1 >= only_sign_iters:
                    if auto_temperature and current_iter - 1 == only_sign_iters:
                        grad_min = min(g.abs().min().item() for g in grads)
                        tmax = torch.atanh(torch.tensor(0.999)) * 2 / (grad_min + 1e-8)
                    progress = min(current_iter - 1 - only_sign_iters, warmup_iters) / warmup_iters if warmup_iters > 0 else 1.0
                    if const_temperature:
                        temperature = tmax
                    else:
                        temperature = (1 - progress) * tmax + tmin * progress
                    param.add_(torch.tanh(temperature/2 * weight_decay * param.data), alpha=-lr)
                else:
                    param.mul_(1 - lr * weight_decay)

            else:
                if wd_warmup:
                    raise NotImplementedError
                grad.add_(param.data, alpha=weight_decay)

        if momentum != 0:
            buf = momentum_buffer_list[i]
            if buf is None:
                buf = torch.clone(grad).detach()
                momentum_buffer_list[i] = buf
            else:
                buf.mul_(momentum).add_(grad, alpha=1 - dampening)

            if nesterov:
                grad = grad.add(buf, alpha=momentum)
            else:
                grad = buf
        grads[i] = grad

    if auto_temperature and current_iter - 1 == only_sign_iters:
        grad_min = min(g.abs().min().item() for g in grads)
        tmax = torch.atanh(torch.tensor(0.999)) * 2 / (grad_min + 1e-8)
    if current_iter - 1 >= only_sign_iters:
        progress = min(current_iter - 1 - only_sign_iters, warmup_iters) / warmup_iters if warmup_iters > 0 else 1.0
        if const_temperature:
            temperature = tmax
        else:
            temperature = (1 - progress) * tmax + tmin * progress

    if hook is not None and current_iter - 1 >= only_sign_iters:
        try:
            hook.log_temperature(temperature)
        except:
            print("Hook can't log tmax!")

    effective_lr = lr
    if normalized or sign_norm:
        norms = [torch.linalg.vector_norm(g) for g in grads]
        total_norm = torch.linalg.vector_norm(torch.stack(norms))
        
        if normalized:
            effective_lr = lr / total_norm
        elif sign_norm:
            effective_lr = lr * total_norm

    for i, param in enumerate(params):
        grad = grads[i]
        if current_iter - 1 >= only_sign_iters:
            if current_iter - 1 - only_sign_iters <= warmup_iters: # soft transfer
                update_vec = torch.tanh(grad * temperature / 2.0)
            elif sgd_last:
                update_vec = grad
            else:
                update_vec = torch.tanh(grad * temperature / 2.0)  
        else:
            update_vec = torch.sign(grad)
        if hook is not None:
            hook(grad, update_vec)
        param.add_(update_vec, alpha=-effective_lr)
    
    if auto_temperature and current_iter - 1 == only_sign_iters:
        return tmax
    return


def softsignum_sgd(
    params: List[Tensor],
    d_p_list: List[Tensor],
    momentum_buffer_list: List[Optional[Tensor]],
    has_sparse_grad: bool = False,
    foreach: Optional[bool] = None,
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    *,
    weight_decay: float,
    momentum: float,
    lr: Union[float, Tensor],
    dampening: float,
    nesterov: bool,
    maximize: bool,
    current_iter: int,
    warmup_iters: int,
    only_sign_iters: int,
    sgd_last: bool,
    decoupled_wd: bool,
    wd_warmup: bool,
    tmin: float,
    tmax: float,
    normalized: bool,
    sign_norm: bool,
    hook: Optional[callable] = None,
    auto_temperature: bool = False,
    const_temperature: bool = False,
):
    r"""Run one functional SoftSignum SGD step over ``params``.

    Only the single-tensor kernel exists, so a truthy ``foreach`` or ``fused``
    is rejected with ``NotImplementedError``. Every other argument is handed
    to ``_single_tensor_softsignum_sgd`` unchanged and that kernel's return
    value is passed straight back to the caller.
    """
    # Guard clauses: the accelerated code paths have not been written.
    if foreach:
        raise NotImplementedError("`foreach` option is not implemented for SoftSignum SGD")
    if fused:
        raise NotImplementedError("`fused` option is not implemented for SoftSignum SGD")

    # Everything the kernel takes by keyword, collected in one place.
    kernel_kwargs = dict(
        grad_scale=grad_scale,
        found_inf=found_inf,
        weight_decay=weight_decay,
        momentum=momentum,
        lr=lr,
        dampening=dampening,
        nesterov=nesterov,
        maximize=maximize,
        has_sparse_grad=has_sparse_grad,
        current_iter=current_iter,
        warmup_iters=warmup_iters,
        only_sign_iters=only_sign_iters,
        sgd_last=sgd_last,
        decoupled_wd=decoupled_wd,
        wd_warmup=wd_warmup,
        tmin=tmin,
        tmax=tmax,
        normalized=normalized,
        sign_norm=sign_norm,
        hook=hook,
        auto_temperature=auto_temperature,
        const_temperature=const_temperature,
    )
    return _single_tensor_softsignum_sgd(
        params, d_p_list, momentum_buffer_list, **kernel_kwargs
    )


class SoftSignumSGD(Optimizer):
    """SGD variant whose step moves from ``sign(grad)`` to ``tanh`` to (optionally) plain SGD.

    The optimizer keeps one global, 1-based step counter in
    ``self.state['step']`` and forwards it, together with the per-group
    hyper-parameters, to :func:`softsignum_sgd`.

    Args:
        params: Iterable of parameters or parameter-group dicts.
        lr: Learning rate (must be non-negative).
        momentum: Momentum factor (must be non-negative).
        dampening: Dampening for momentum.
        weight_decay: Weight decay coefficient (must be non-negative).
        nesterov: Enable Nesterov momentum; requires ``momentum > 0`` and
            ``dampening == 0``.
        warmup_iters: Number of iterations over which the temperature is
            interpolated from ``tmax`` to ``tmin``.
        only_sign_iters: Number of initial iterations that use pure sign steps.
        sgd_last: Use the raw gradient after the warm-up has finished.
        decoupled_wd: Apply weight decay to the parameters directly rather
            than through the gradient.
        wd_warmup: Use the ``tanh``-shaped decoupled weight decay during the
            soft phase.
        tmin: Final temperature.
        tmax: Initial temperature; overwritten per group when
            ``auto_temperature`` calibrates it.
        auto_temperature: Calibrate ``tmax`` from the gradients on the first
            soft iteration.
        const_temperature: Keep the temperature fixed at ``tmax``.
        sign_norm: Scale ``lr`` by the global gradient norm.
        normalized: Divide ``lr`` by the global gradient norm. Mutually
            exclusive with ``sign_norm``.
        maximize: Maximize the objective instead of minimizing it.
        foreach: Stored in the group; a truthy value is rejected at step time.
        differentiable: Stored in the group and honoured by the ``step``
            decorator.
        fused: Stored in the group; a truthy value is rejected at step time.
        hook: Optional callable ``hook(grad, update_vec)`` that may also
            expose ``log_temperature(temperature)``.

    Raises:
        ValueError: On a negative ``lr``, ``momentum`` or ``weight_decay``, on
            ``sign_norm`` combined with ``normalized``, or on an invalid
            Nesterov configuration.
    """

    def __init__(
        self,
        params,
        lr: Union[float, Tensor] = 1e-3,
        momentum: float = 0,
        dampening: float = 0,
        weight_decay: float = 0,
        nesterov: bool = False,
        warmup_iters: int = 1000,
        only_sign_iters: int = 1000,
        sgd_last: bool = False,
        decoupled_wd: bool = True,
        wd_warmup: bool = False,
        tmin: float = 2.0,
        tmax: float = 20.0,
        auto_temperature: bool = False,
        const_temperature: bool = False,
        sign_norm: bool = False,
        normalized: bool = False,
        *,
        maximize: bool = False,
        foreach: Optional[bool] = None,
        differentiable: bool = False,
        fused: Optional[bool] = None,
        hook: Optional[callable] = None,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= momentum:
            raise ValueError(f"Invalid momentum value: {momentum}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if sign_norm and normalized:
            raise ValueError("sign_norm and normalized are mutually exclusive")

        defaults = dict(
            lr=lr,
            momentum=momentum,
            dampening=dampening,
            weight_decay=weight_decay,
            nesterov=nesterov,
            warmup_iters=warmup_iters,
            only_sign_iters=only_sign_iters,
            sgd_last=sgd_last,
            decoupled_wd=decoupled_wd,
            wd_warmup=wd_warmup,
            tmin=tmin,
            tmax=tmax,
            auto_temperature=auto_temperature,
            const_temperature=const_temperature,
            sign_norm=sign_norm,
            normalized=normalized,
            maximize=maximize,
            foreach=foreach,
            differentiable=differentiable,
            fused=fused,
        )
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")

        super().__init__(params, defaults)
        # Global step counter shared by all parameter groups.
        self.state.setdefault('step', 0)
        self.hook = hook

    def __setstate__(self, state):
        """Restore the optimizer and backfill attributes/keys that may be missing."""
        super().__setstate__(state)
        self.state.setdefault('step', 0)
        # The base Optimizer pickles only defaults, state and param_groups, so
        # `hook` does not survive pickle / copy.deepcopy; fall back to "no hook".
        self.__dict__.setdefault("hook", None)
        for group in self.param_groups:
            group.setdefault("nesterov", False)
            group.setdefault("maximize", False)
            group.setdefault("foreach", None)
            group.setdefault("differentiable", False)
            group.setdefault("fused", False)
            group.setdefault("sign_norm", False)
            group.setdefault("normalized", False)
            # Keys read unconditionally by `step`; mirror the constructor defaults.
            group.setdefault("wd_warmup", False)
            group.setdefault("auto_temperature", False)
            group.setdefault("const_temperature", False)

    def _init_group(self, group, params, grads, momentum_buffer_list):
        """Collect the parameters of ``group`` that have a gradient.

        Appends to ``params``, ``grads`` and ``momentum_buffer_list`` in
        lock-step (the buffer entry is ``None`` when no buffer exists yet).

        Returns:
            ``True`` if at least one collected gradient is sparse.
        """
        has_sparse_grad = False
        for p in group['params']:
            if p.grad is not None:
                params.append(p)
                grads.append(p.grad)
                if p.grad.is_sparse:
                    has_sparse_grad = True
                state = self.state[p]
                momentum_buffer_list.append(state.get('momentum_buffer'))
        return has_sparse_grad

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        self.state['step'] += 1
        current_iter = self.state['step']
        # Tolerate instances restored without the attribute.
        hook = getattr(self, "hook", None)

        for group in self.param_groups:
            params: List[Tensor] = []
            grads: List[Tensor] = []
            momentum_buffer_list: List[Optional[Tensor]] = []

            has_sparse_grad = self._init_group(
                group, params, grads, momentum_buffer_list
            )

            new_tmax = softsignum_sgd(
                params,
                grads,
                momentum_buffer_list,
                weight_decay=group["weight_decay"],
                momentum=group["momentum"],
                lr=group["lr"],
                dampening=group["dampening"],
                nesterov=group["nesterov"],
                maximize=group["maximize"],
                current_iter=current_iter,
                warmup_iters=group["warmup_iters"],
                only_sign_iters=group["only_sign_iters"],
                sgd_last=group["sgd_last"],
                decoupled_wd=group["decoupled_wd"],
                wd_warmup=group['wd_warmup'],
                tmin=group["tmin"],
                tmax=group["tmax"],
                auto_temperature=group['auto_temperature'],
                const_temperature=group['const_temperature'],
                normalized=group["normalized"],
                sign_norm=group["sign_norm"],
                has_sparse_grad=has_sparse_grad,
                foreach=group["foreach"],
                fused=group["fused"],
                grad_scale=getattr(self, "grad_scale", None),
                found_inf=getattr(self, "found_inf", None),
                hook=hook,
            )

            # Persist the calibrated temperature so later steps reuse it.
            if group["auto_temperature"] and new_tmax is not None:
                group["tmax"] = new_tmax

            if group["momentum"] != 0:
                # Buffers may have been created during this step; store them.
                for p, momentum_buffer in zip(params, momentum_buffer_list):
                    self.state[p]["momentum_buffer"] = momentum_buffer
        return loss


class SoftSignumWithAdam(Optimizer):
    def __init__(
        self,
        params,
        lr: Union[float, Tensor] = 1e-3,
        momentum: float = 0,
        dampening: float = 0,
        weight_decay: float = 0,
        nesterov: bool = False,
        warmup_iters: int = 1000,
        only_sign_iters: int = 1000,
        sgd_last: bool = False,
        decoupled_wd: bool = True,
        tmin: float = 2.0,
        tmax: float = 20.0,
        auto_temperature: bool = False,
        const_temperature: bool = False,
        sign_norm: bool = False,
        normalized: bool = False,
        adam_lr: Union[float, Tensor] = 1e-3,
        adam_betas: Tuple[float, float] = (0.9, 0.999),
        adam_eps: float = 1e-8,
        adam_amsgrad: bool = False,
        *,
        maximize: bool = False,
        foreach: Optional[bool] = None,
        fused: Optional[bool] = None,
        differentiable: bool = False,
        hook: Optional[callable] = None,
    ):
        if sign_norm and normalized:
            raise ValueError(f"sign_norm and normalized are mutually exclusive")

        defaults = dict()
        super().__init__([group for group in params], defaults)
        del self.param_groups

        self.state.setdefault('step', 0)

        sgd_params = [group for group in params if self._is_sgd_group(group)]
        adam_params = [group for group in params if not self._is_sgd_group(group)]

        softsignum_optimizer = SoftSignumSGD(
            sgd_params,
            lr=lr,
            momentum=momentum,
            dampening=dampening,
            weight_decay=weight_decay,
            nesterov=nesterov,
            warmup_iters=warmup_iters,
            only_sign_iters=only_sign_iters,
            sgd_last=sgd_last,
            decoupled_wd=decoupled_wd,
            tmin=tmin,
            tmax=tmax,
            auto_temperature=auto_temperature,
            const_temperature=const_temperature,
            sign_norm=sign_norm,
            normalized=normalized,
            maximize=maximize,
            foreach=False,
            fused=False,
            differentiable=False,
            hook=hook,
        )

        adam_optimizer = torch.optim.AdamW(
            adam_params,
            lr=adam_lr,
            betas=adam_betas,
            eps=adam_eps,
            weight_decay=weight_decay,
            amsgrad=adam_amsgrad,
            maximize=maximize,
            foreach=foreach,
            fused=fused,
            differentiable=differentiable,
        )

        self.param_groups = softsignum_optimizer.param_groups + adam_optimizer.param_groups
        self._init_group_sgd = SoftSignumSGD._init_group.__get__(self)
        self._init_group_adam = AdamW._init_group.__get__(self)
        self.hook = hook

    def __setstate__(self, state):
        if 'state' in state and 'step' in state['state']:
             step_state = state['state'].pop('step')
             super().__setstate__(state)
             self.state['step'] = step_state
        else:
            super().__setstate__(state)
        self.state.setdefault('step', 0)

    def _is_sgd_group(self, group):
        return group.get("is_proj_params", False) or group.get("is_sgd_params", False)

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
        
        self.state['step'] += 1
        current_iter = self.state['step']

        for group in self.param_groups:
            if self._is_sgd_group(group):
                params: List[Tensor] = []
                grads: List[Tensor] = []
                momentum_buffer_list: List[Optional[Tensor]] = []

                has_sparse_grad = self._init_group_sgd(
                    group, params, grads, momentum_buffer_list
                )

                new_tmax = softsignum_sgd(
                    params,
                    grads,
                    momentum_buffer_list,
                    weight_decay=group["weight_decay"],
                    momentum=group["momentum"],
                    lr=group["lr"],
                    dampening=group["dampening"],
                    nesterov=group["nesterov"],
                    maximize=group["maximize"],
                    current_iter=current_iter,
                    warmup_iters=group["warmup_iters"],
                    only_sign_iters=group["only_sign_iters"],
                    sgd_last=group["sgd_last"],
                    decoupled_wd=group["decoupled_wd"],
                    tmin=group["tmin"],
                    tmax=group["tmax"],
                    normalized=group["normalized"],
                    sign_norm=group["sign_norm"],
                    has_sparse_grad=has_sparse_grad,
                    foreach=group["foreach"],
                    fused=group["fused"],
                    grad_scale=getattr(self, "grad_scale", None),
                    found_inf=getattr(self, "found_inf", None),
                    auto_temperature=group["auto_temperature"],
                    const_temperature=group["const_temperature"],
                    hook=getattr(self, "hook", None),
                )
                
                if group["auto_temperature"] and new_tmax is not None:
                    group["tmax"] = new_tmax
                
                if group["momentum"] != 0:
                    for p, momentum_buffer in zip(params, momentum_buffer_list):
                        self.state[p]["momentum_buffer"] = momentum_buffer
                
            else:
                params_with_grad = []
                grads = []
                exp_avgs = []
                exp_avg_sqs = []
                max_exp_avg_sqs = []
                state_steps = []
                amsgrad = group["amsgrad"]
                beta1, beta2 = group["betas"]

                self._init_group_adam(
                    group,
                    params_with_grad,
                    grads,
                    amsgrad,
                    exp_avgs,
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
                )

                adamw(
                    params_with_grad,
                    grads,
                    exp_avgs,
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
                    amsgrad=amsgrad,
                    beta1=beta1,
                    beta2=beta2,
                    lr=group["lr"],
                    weight_decay=group["weight_decay"],
                    eps=group["eps"],
                    maximize=group["maximize"],
                    foreach=group["foreach"],
                    capturable=group.get("capturable", False),
                    differentiable=group["differentiable"],
                    fused=group["fused"],
                    grad_scale=getattr(self, "grad_scale", None),
                    found_inf=getattr(self, "found_inf", None),
                )

        return loss

