import math
import torch
from torch.optim.optimizer import Optimizer

class Muon(Optimizer):
    """
    A simple momentum-based optimizer with decoupled weight decay.
    This serves as a lightweight stand-in for Muon-style optimizers: it
    applies plain (optionally Nesterov) momentum and nothing else is done
    to the update.

    For every parameter ``p`` that has a gradient ``g``, one step performs::

        p   <- p * (1 - lr * weight_decay)     # skipped when weight_decay == 0
        buf <- momentum * buf + g
        p   <- p - lr * (g + momentum * buf)   # if nesterov
        p   <- p - lr * buf                    # otherwise

    Args:
        params: iterable of parameters (or parameter-group dicts) to optimize
        lr: learning rate; must be strictly positive
        momentum: momentum factor; must be non-negative
        weight_decay: decoupled weight decay coefficient; must be
            non-negative. The parameters are shrunk directly by
            ``1 - lr * weight_decay``; the decay is *not* added to the
            gradient as an L2 penalty, so it never enters the momentum buffer.
        nesterov: whether to use Nesterov momentum

    Raises:
        ValueError: if ``lr <= 0``, ``momentum < 0`` or ``weight_decay < 0``.
    """

    def __init__(self, params, lr=1e-3, momentum=0.9, weight_decay=0.0, nesterov=False):
        if lr <= 0.0:
            raise ValueError(f"Invalid lr: {lr}")
        if momentum < 0.0:
            raise ValueError(f"Invalid momentum: {momentum}")
        # A negative coefficient would make the "decay" multiply the weights
        # by a factor > 1 on every step, so reject it up front.
        if weight_decay < 0.0:
            raise ValueError(f"Invalid weight_decay: {weight_decay}")
        defaults = dict(lr=lr, momentum=momentum, weight_decay=weight_decay, nesterov=nesterov)
        super().__init__(params, defaults)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure: optional callable that re-evaluates the model and returns
                the loss. It is invoked with gradients enabled before any
                parameter is updated.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.
        """
        loss = None
        if closure is not None:
            # step() itself runs under no_grad; the closure needs autograd.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group['lr']
            momentum = group['momentum']
            wd = group['weight_decay']
            nesterov = group['nesterov']
            # Identical for every parameter of the group, so compute it once.
            decay_factor = 1.0 - lr * wd

            for p in group['params']:
                if p.grad is None:
                    # Parameters that received no gradient are left untouched
                    # (no decay, no momentum update).
                    continue
                d_p = p.grad

                # Decoupled weight decay: shrink the weights directly instead
                # of folding the decay term into the gradient.
                if wd != 0.0:
                    p.mul_(decay_factor)

                state = self.state[p]
                # Check for the key itself rather than an empty state so that
                # unrelated entries in the per-parameter state cannot cause a
                # KeyError below.
                if 'momentum_buffer' not in state:
                    state['momentum_buffer'] = torch.zeros_like(p)
                buf = state['momentum_buffer']

                # buf <- momentum * buf + grad (updated in place, no dampening).
                buf.mul_(momentum).add_(d_p)
                if nesterov:
                    # Out-of-place add: p.grad must not be modified.
                    update = d_p.add(buf, alpha=momentum)
                else:
                    update = buf

                p.add_(update, alpha=-lr)

        return loss

# Public API: the only name exported by a star-import of this module.
__all__ = ["Muon"] 