# mypy: allow-untyped-defs
r"""Implementation for an advanced SGD-like optimizer with SoftSignum and AdamW."""

from typing import List, Optional, Tuple, Union, cast

import torch
from torch import Tensor
from torch.optim.adamw import AdamW, adamw
from torch.optim.optimizer import (
    Optimizer,
    _default_to_fused_or_foreach,
    _use_grad_for_differentiable,
)


def _single_tensor_softsignum_sgd(
    params: List[Tensor],
    grads: List[Tensor],
    momentum_buffer_list: List[Optional[Tensor]],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    weight_decay: float,
    momentum: float,
    lr: Union[float, Tensor],
    dampening: float,
    nesterov: bool,
    maximize: bool,
    current_iter: int,
    warmup_iters: int,
    sgd_last: bool,
    decoupled_wd: bool,
    only_sign_iters: int,
    tmin: float,
    tmax: float,
    normalized: bool,
    sign_norm: bool,
    has_sparse_grad: bool,
    hook: Optional[callable] = None,
    auto_temperature: bool = False,
    const_temperature: bool = False,
):
    """
    Contains the core logic for a single tensor SoftSignum SGD update with normalization.
    """
    assert grad_scale is None and found_inf is None
    temperature = 0.0
    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]

        if weight_decay != 0:
            if decoupled_wd:
                param.mul_(1 - lr * weight_decay)
            else:
                grad.add_(param.data, alpha=weight_decay)

        if momentum != 0:
            buf = momentum_buffer_list[i]
            if buf is None:
                buf = torch.clone(grad).detach()
                momentum_buffer_list[i] = buf
            else:
                buf.mul_(momentum).add_(grad, alpha=1 - dampening)

            if nesterov:
                grad = grad.add(buf, alpha=momentum)
            else:
                grad = buf
        grads[i] = grad

    if auto_temperature and current_iter - 1 == only_sign_iters:
        grad_min = min(g.abs().min().item() for g in grads)
        tmax = torch.atanh(torch.tensor(0.999)) * 2 / (grad_min + 1e-8)
    if current_iter - 1 >= only_sign_iters:
        progress = (
            min(current_iter - 1 - only_sign_iters, warmup_iters) / warmup_iters
            if warmup_iters > 0
            else 1.0
        )
        if const_temperature:
            temperature = tmax
        else:
            temperature = (1 - progress) * tmax + tmin * progress

    if hook is not None and current_iter - 1 >= only_sign_iters:
        try:
            hook.log_temperature(temperature)
        except:
            print("Hook can't log tmax!")

    effective_lr = lr
    if normalized or sign_norm:
        norms = [torch.linalg.vector_norm(g) for g in grads]
        total_norm = torch.linalg.vector_norm(torch.stack(norms))

        if normalized:
            effective_lr = lr / total_norm
        elif sign_norm:
            effective_lr = lr * total_norm

    for i, param in enumerate(params):
        grad = grads[i]
        if current_iter - 1 >= only_sign_iters:
            if current_iter - 1 - only_sign_iters <= warmup_iters:  # soft transfer
                update_vec = torch.tanh(grad * temperature / 2.0)
            elif sgd_last:
                update_vec = grad
            else:
                update_vec = torch.tanh(grad * temperature / 2.0)
        else:
            update_vec = torch.sign(grad)
        if hook is not None:
            hook(grad, update_vec)
        param.add_(update_vec, alpha=-effective_lr)

    if auto_temperature and current_iter - 1 == only_sign_iters:
        return tmax, temperature
    return tmax, temperature


def softsignum_sgd(
    params: List[Tensor],
    d_p_list: List[Tensor],
    momentum_buffer_list: List[Optional[Tensor]],
    has_sparse_grad: bool = False,
    foreach: Optional[bool] = None,
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    *,
    weight_decay: float,
    momentum: float,
    lr: Union[float, Tensor],
    dampening: float,
    nesterov: bool,
    maximize: bool,
    current_iter: int,
    warmup_iters: int,
    only_sign_iters: int,
    sgd_last: bool,
    decoupled_wd: bool,
    tmin: float,
    tmax: float,
    normalized: bool,
    sign_norm: bool,
    hook: Optional[callable] = None,
    auto_temperature: bool = False,
    const_temperature: bool = False,
):
    r"""Functional entry point for the SoftSignum SGD update.

    Only the single-tensor implementation exists, so a truthy ``foreach`` or
    ``fused`` is rejected with ``NotImplementedError`` (``foreach`` is checked
    first). Every other argument is forwarded unchanged to
    ``_single_tensor_softsignum_sgd``.

    Returns:
        The ``(tmax, temperature)`` pair produced by the single-tensor
        implementation.
    """
    unsupported = None
    if foreach:
        unsupported = "`foreach` option is not implemented for SoftSignum SGD"
    elif fused:
        unsupported = "`fused` option is not implemented for SoftSignum SGD"
    if unsupported is not None:
        raise NotImplementedError(unsupported)

    # Settings that drive the sign -> tanh -> (optional) raw SGD schedule.
    schedule = dict(
        current_iter=current_iter,
        warmup_iters=warmup_iters,
        only_sign_iters=only_sign_iters,
        sgd_last=sgd_last,
        tmin=tmin,
        tmax=tmax,
        auto_temperature=auto_temperature,
        const_temperature=const_temperature,
    )
    # Classic SGD knobs plus the learning-rate normalisation switches.
    sgd_settings = dict(
        weight_decay=weight_decay,
        momentum=momentum,
        lr=lr,
        dampening=dampening,
        nesterov=nesterov,
        maximize=maximize,
        decoupled_wd=decoupled_wd,
        normalized=normalized,
        sign_norm=sign_norm,
        has_sparse_grad=has_sparse_grad,
    )

    updated_tmax, used_temperature = _single_tensor_softsignum_sgd(
        params,
        d_p_list,
        momentum_buffer_list,
        grad_scale,
        found_inf,
        hook=hook,
        **sgd_settings,
        **schedule,
    )
    return updated_tmax, used_temperature


class SoftSignumSGD(Optimizer):
    """SGD variant whose update moves from ``sign`` to ``tanh`` of the gradient.

    A global step counter is kept under ``self.state["step"]``. Each call to
    :meth:`step` forwards every parameter group to ``softsignum_sgd`` and then
    records the returned temperature in ``group["temperature"]`` (and the
    returned ``tmax`` in ``group["tmax"]`` when ``auto_temperature`` is set).

    Args:
        params: Iterable of parameters or parameter-group dicts.
        lr: Learning rate (must be non-negative).
        momentum: Momentum factor (must be non-negative).
        dampening: Dampening for momentum.
        weight_decay: Weight decay coefficient (must be non-negative).
        nesterov: Enable Nesterov momentum; requires ``momentum > 0`` and
            ``dampening == 0``.
        warmup_iters: Steps over which the temperature goes from ``tmax`` to
            ``tmin`` after the sign phase.
        only_sign_iters: Number of initial pure-sign steps.
        sgd_last: Use the raw direction once the warmup is over.
        decoupled_wd: Apply weight decay directly to the parameters instead of
            adding it to the gradient.
        tmin: Final temperature.
        tmax: Initial temperature.
        auto_temperature: Re-estimate ``tmax`` on the first step after the
            sign phase.
        const_temperature: Keep the temperature fixed at ``tmax``.
        sign_norm: Multiply the learning rate by the group's direction norm.
        normalized: Divide the learning rate by the group's direction norm.
            Mutually exclusive with ``sign_norm``.
        maximize: Maximise the objective instead of minimising it.
        foreach: Stored in the group and forwarded; a truthy value makes
            ``softsignum_sgd`` raise ``NotImplementedError``.
        differentiable: Stored in the group defaults.
        fused: Stored in the group and forwarded; a truthy value makes
            ``softsignum_sgd`` raise ``NotImplementedError``.
        hook: Optional callable forwarded to ``softsignum_sgd`` on every step.
            It is kept as a plain attribute, so it is not part of the pickled
            optimizer state.

    Raises:
        ValueError: On a negative ``lr``, ``momentum`` or ``weight_decay``, when
            both ``sign_norm`` and ``normalized`` are set, or when ``nesterov``
            is requested without momentum or with dampening.
    """

    def __init__(
        self,
        params,
        lr: Union[float, Tensor] = 1e-3,
        momentum: float = 0,
        dampening: float = 0,
        weight_decay: float = 0,
        nesterov: bool = False,
        warmup_iters: int = 1000,
        only_sign_iters: int = 1000,
        sgd_last: bool = False,
        decoupled_wd: bool = True,
        tmin: float = 2.0,
        tmax: float = 20.0,
        auto_temperature: bool = False,
        const_temperature: bool = False,
        sign_norm: bool = False,
        normalized: bool = False,
        *,
        maximize: bool = False,
        foreach: Optional[bool] = None,
        differentiable: bool = False,
        fused: Optional[bool] = None,
        hook: Optional[callable] = None,
    ):
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= momentum:
            raise ValueError(f"Invalid momentum value: {momentum}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if sign_norm and normalized:
            raise ValueError("sign_norm and normalized are mutually exclusive")

        defaults = dict(
            lr=lr,
            momentum=momentum,
            dampening=dampening,
            weight_decay=weight_decay,
            nesterov=nesterov,
            warmup_iters=warmup_iters,
            only_sign_iters=only_sign_iters,
            sgd_last=sgd_last,
            decoupled_wd=decoupled_wd,
            tmin=tmin,
            tmax=tmax,
            auto_temperature=auto_temperature,
            const_temperature=const_temperature,
            sign_norm=sign_norm,
            normalized=normalized,
            maximize=maximize,
            foreach=foreach,
            differentiable=differentiable,
            fused=fused,
        )
        if nesterov and (momentum <= 0 or dampening != 0):
            raise ValueError("Nesterov momentum requires a momentum and zero dampening")

        super().__init__(params, defaults)
        # Global step counter, stored next to the per-parameter state.
        self.state.setdefault("step", 0)
        self.hook = hook

    def __setstate__(self, state):
        """Restore the optimizer and fill in values missing from ``state``."""
        super().__setstate__(state)
        self.state.setdefault("step", 0)
        # The restored state carries only defaults, state and param_groups, so
        # ``hook`` is absent after unpickling/deepcopy; default it to ``None``
        # so that ``step`` keeps working.
        self.__dict__.setdefault("hook", None)
        for group in self.param_groups:
            group.setdefault("nesterov", False)
            group.setdefault("maximize", False)
            group.setdefault("foreach", None)
            group.setdefault("differentiable", False)
            group.setdefault("fused", False)
            group.setdefault("sign_norm", False)
            group.setdefault("normalized", False)

    def _init_group(self, group, params, grads, momentum_buffer_list):
        """Collect the parameters of ``group`` that have a gradient.

        Appends, index-aligned, to ``params``, ``grads`` and
        ``momentum_buffer_list`` (``None`` when the parameter has no stored
        ``"momentum_buffer"`` yet).

        Returns:
            ``True`` if any collected gradient is sparse.
        """
        has_sparse_grad = False
        for p in group["params"]:
            if p.grad is not None:
                params.append(p)
                grads.append(p.grad)
                if p.grad.is_sparse:
                    has_sparse_grad = True
                state = self.state[p]
                momentum_buffer_list.append(state.get("momentum_buffer"))
        return has_sparse_grad

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimisation step.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The closure's return value, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        self.state["step"] += 1
        current_iter = self.state["step"]

        for group in self.param_groups:
            params: List[Tensor] = []
            grads: List[Tensor] = []
            momentum_buffer_list: List[Optional[Tensor]] = []

            has_sparse_grad = self._init_group(
                group, params, grads, momentum_buffer_list
            )

            new_tmax, temperature = softsignum_sgd(
                params,
                grads,
                momentum_buffer_list,
                weight_decay=group["weight_decay"],
                momentum=group["momentum"],
                lr=group["lr"],
                dampening=group["dampening"],
                nesterov=group["nesterov"],
                maximize=group["maximize"],
                current_iter=current_iter,
                warmup_iters=group["warmup_iters"],
                only_sign_iters=group["only_sign_iters"],
                sgd_last=group["sgd_last"],
                decoupled_wd=group["decoupled_wd"],
                tmin=group["tmin"],
                tmax=group["tmax"],
                auto_temperature=group["auto_temperature"],
                const_temperature=group["const_temperature"],
                normalized=group["normalized"],
                sign_norm=group["sign_norm"],
                has_sparse_grad=has_sparse_grad,
                foreach=group["foreach"],
                fused=group["fused"],
                grad_scale=getattr(self, "grad_scale", None),
                found_inf=getattr(self, "found_inf", None),
                # Read defensively: the attribute may be missing on instances
                # restored without going through ``__init__``.
                hook=getattr(self, "hook", None),
            )

            if group["auto_temperature"] and new_tmax is not None:
                group["tmax"] = new_tmax
            group["temperature"] = temperature

            if group["momentum"] != 0:
                # Persist buffers that were created during this step.
                for p, momentum_buffer in zip(params, momentum_buffer_list):
                    self.state[p]["momentum_buffer"] = momentum_buffer
        return loss


class SoftSignumSGDWithAdam(Optimizer):
    """Hybrid optimizer: SoftSignum SGD for flagged groups, AdamW for the rest.

    ``params`` must be an iterable of parameter-group dicts. A group whose
    ``"is_proj_params"`` or ``"is_sgd_params"`` entry is truthy is updated with
    ``softsignum_sgd``; every other group is updated with the functional
    ``adamw``. Group defaults are filled in by constructing a temporary
    ``SoftSignumSGD`` / ``torch.optim.AdamW`` over the respective groups; the
    resulting groups are stored SoftSignum groups first, AdamW groups second.
    Either kind may be absent.

    Args:
        params: Iterable of parameter-group dicts (it is materialised once, so
            a generator is accepted).
        lr, momentum, dampening, nesterov, warmup_iters, only_sign_iters,
        sgd_last, decoupled_wd, tmin, tmax, auto_temperature,
        const_temperature, sign_norm, normalized: Defaults for the SoftSignum
            groups, passed to ``SoftSignumSGD``.
        weight_decay: Default weight decay for both kinds of groups.
        adam_lr, adam_betas, adam_eps, adam_amsgrad: Defaults for the AdamW
            groups (``lr``, ``betas``, ``eps``, ``amsgrad``).
        maximize: Default for both kinds of groups.
        foreach, fused, differentiable: Defaults for the AdamW groups only;
            SoftSignum groups always get ``False`` for all three.
        hook: Optional callable forwarded to ``softsignum_sgd`` on every step.

    Raises:
        ValueError: If both ``sign_norm`` and ``normalized`` are set.
        TypeError: If an element of ``params`` is not a dict.
    """

    def __init__(
        self,
        params,
        lr: Union[float, Tensor] = 1e-3,
        momentum: float = 0,
        dampening: float = 0,
        weight_decay: float = 0,
        nesterov: bool = False,
        warmup_iters: int = 1000,
        only_sign_iters: int = 1000,
        sgd_last: bool = False,
        decoupled_wd: bool = True,
        tmin: float = 2.0,
        tmax: float = 20.0,
        auto_temperature: bool = False,
        const_temperature: bool = False,
        sign_norm: bool = False,
        normalized: bool = False,
        adam_lr: Union[float, Tensor] = 1e-3,
        adam_betas: Tuple[float, float] = (0.9, 0.999),
        adam_eps: float = 1e-8,
        adam_amsgrad: bool = False,
        *,
        maximize: bool = False,
        foreach: Optional[bool] = None,
        fused: Optional[bool] = None,
        differentiable: bool = False,
        hook: Optional[callable] = None,
    ):
        if sign_norm and normalized:
            raise ValueError("sign_norm and normalized are mutually exclusive")

        # ``params`` is walked several times below; materialise it once so a
        # generator is not exhausted after the first pass.
        param_groups = list(params)
        if not all(isinstance(group, dict) for group in param_groups):
            raise TypeError(
                "SoftSignumSGDWithAdam expects an iterable of parameter-group dicts"
            )

        defaults = dict()
        super().__init__(param_groups, defaults)

        self.state.setdefault("step", 0)

        sgd_params = [group for group in param_groups if self._is_sgd_group(group)]
        adam_params = [
            group for group in param_groups if not self._is_sgd_group(group)
        ]

        # The helper optimizers are only used to populate the group dicts with
        # their defaults. An empty side is skipped because the helper
        # constructors reject an empty parameter list.
        sgd_groups = []
        if sgd_params:
            softsignum_optimizer = SoftSignumSGD(
                sgd_params,
                lr=lr,
                momentum=momentum,
                dampening=dampening,
                weight_decay=weight_decay,
                nesterov=nesterov,
                warmup_iters=warmup_iters,
                only_sign_iters=only_sign_iters,
                sgd_last=sgd_last,
                decoupled_wd=decoupled_wd,
                tmin=tmin,
                tmax=tmax,
                auto_temperature=auto_temperature,
                const_temperature=const_temperature,
                sign_norm=sign_norm,
                normalized=normalized,
                maximize=maximize,
                foreach=False,
                fused=False,
                differentiable=False,
                hook=hook,
            )
            sgd_groups = softsignum_optimizer.param_groups

        adam_groups = []
        if adam_params:
            adam_optimizer = torch.optim.AdamW(
                adam_params,
                lr=adam_lr,
                betas=adam_betas,
                eps=adam_eps,
                weight_decay=weight_decay,
                amsgrad=adam_amsgrad,
                maximize=maximize,
                foreach=foreach,
                fused=fused,
                differentiable=differentiable,
            )
            adam_groups = adam_optimizer.param_groups

        self.param_groups = sgd_groups + adam_groups
        self.hook = hook

    def __setstate__(self, state):
        """Restore the optimizer, preserving the global ``"step"`` counter."""
        if "state" in state and "step" in state["state"]:
            step_state = state["state"].pop("step")
            super().__setstate__(state)
            self.state["step"] = step_state
        else:
            super().__setstate__(state)
        self.state.setdefault("step", 0)
        # ``hook`` is not part of the restored state; default it to ``None``.
        self.__dict__.setdefault("hook", None)

    def _is_sgd_group(self, group):
        """Return whether ``group`` is flagged for the SoftSignum update."""
        return group.get("is_proj_params", False) or group.get("is_sgd_params", False)

    # The two group initialisers are real methods rather than attributes bound
    # in ``__init__`` so that they are still available on instances restored
    # through ``__setstate__`` (pickle / deepcopy).
    def _init_group_sgd(self, group, params, grads, momentum_buffer_list):
        """Run ``SoftSignumSGD._init_group`` against this optimizer's state."""
        return SoftSignumSGD._init_group(
            self, group, params, grads, momentum_buffer_list
        )

    def _init_group_adam(self, *args, **kwargs):
        """Run ``AdamW._init_group`` against this optimizer's state."""
        return AdamW._init_group(self, *args, **kwargs)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimisation step over all groups.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss; it is run with gradients enabled.

        Returns:
            The closure's return value, or ``None`` without a closure.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        self.state["step"] += 1
        current_iter = self.state["step"]

        for group in self.param_groups:
            if self._is_sgd_group(group):
                params: List[Tensor] = []
                grads: List[Tensor] = []
                momentum_buffer_list: List[Optional[Tensor]] = []

                has_sparse_grad = self._init_group_sgd(
                    group, params, grads, momentum_buffer_list
                )

                new_tmax, temperature = softsignum_sgd(
                    params,
                    grads,
                    momentum_buffer_list,
                    weight_decay=group["weight_decay"],
                    momentum=group["momentum"],
                    lr=group["lr"],
                    dampening=group["dampening"],
                    nesterov=group["nesterov"],
                    maximize=group["maximize"],
                    current_iter=current_iter,
                    warmup_iters=group["warmup_iters"],
                    only_sign_iters=group["only_sign_iters"],
                    sgd_last=group["sgd_last"],
                    decoupled_wd=group["decoupled_wd"],
                    tmin=group["tmin"],
                    tmax=group["tmax"],
                    normalized=group["normalized"],
                    sign_norm=group["sign_norm"],
                    has_sparse_grad=has_sparse_grad,
                    foreach=group["foreach"],
                    fused=group["fused"],
                    grad_scale=getattr(self, "grad_scale", None),
                    found_inf=getattr(self, "found_inf", None),
                    auto_temperature=group["auto_temperature"],
                    const_temperature=group["const_temperature"],
                    hook=getattr(self, "hook", None),
                )

                if group["auto_temperature"] and new_tmax is not None:
                    group["tmax"] = new_tmax
                group["temperature"] = temperature

                if group["momentum"] != 0:
                    # Persist buffers that were created during this step.
                    for p, momentum_buffer in zip(params, momentum_buffer_list):
                        self.state[p]["momentum_buffer"] = momentum_buffer

            else:
                params_with_grad = []
                grads = []
                exp_avgs = []
                exp_avg_sqs = []
                max_exp_avg_sqs = []
                state_steps = []
                amsgrad = group["amsgrad"]
                beta1, beta2 = group["betas"]

                # NOTE(review): the positional layout (with ``amsgrad``) and the
                # ignored return value follow the AdamW._init_group signature
                # this file was written against - confirm for the installed
                # torch version.
                self._init_group_adam(
                    group,
                    params_with_grad,
                    grads,
                    amsgrad,
                    exp_avgs,
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
                )

                adamw(
                    params_with_grad,
                    grads,
                    exp_avgs,
                    exp_avg_sqs,
                    max_exp_avg_sqs,
                    state_steps,
                    amsgrad=amsgrad,
                    beta1=beta1,
                    beta2=beta2,
                    lr=group["lr"],
                    weight_decay=group["weight_decay"],
                    eps=group["eps"],
                    maximize=group["maximize"],
                    foreach=group["foreach"],
                    capturable=group.get("capturable", False),
                    differentiable=group["differentiable"],
                    fused=group["fused"],
                    grad_scale=getattr(self, "grad_scale", None),
                    found_inf=getattr(self, "found_inf", None),
                )

        return loss
