"""Hooked Transformer RMS Norm Component.

This module contains the :class:`RMSNorm` component.
"""
from typing import Dict, Optional, Union

import torch
import torch.nn as nn
from jaxtyping import Float

from transformer_lens.hook_points import HookPoint
from transformer_lens.HookedTransformerConfig import HookedTransformerConfig


class RMSNorm(nn.Module):
    """RMSNorm - LayerNorm without the centering and bias (RMS = Root Mean Square).

    The input is multiplied by the reciprocal of its root mean square over the
    last dimension and then by the learned per-feature weight ``w``.
    """

    def __init__(self, cfg: Union[Dict, HookedTransformerConfig], length: Optional[int] = None):
        """
        Build the RMSNorm layer.

        Args:
            cfg (Union[Dict, HookedTransformerConfig]): Model configuration; passed through
                ``HookedTransformerConfig.unwrap``. ``eps``, ``d_model`` and ``dtype`` are read
                from it.
            length (Optional[int]): The dimension of the RMSNorm. If not provided, assumed to be
                d_model.
        """
        super().__init__()
        self.cfg = HookedTransformerConfig.unwrap(cfg)
        self.eps = self.cfg.eps
        self.length = self.cfg.d_model if length is None else length

        # Learned per-feature gain, initialised to ones in the configured dtype.
        self.w = nn.Parameter(torch.ones(self.length, dtype=self.cfg.dtype))

        # Hook point for the normalisation scale factor. Note that the value passed
        # through it is the *reciprocal* RMS, i.e. rsqrt(mean(x^2) + eps).
        self.hook_scale = HookPoint()  # [batch, pos, 1]
        self.hook_normalized = HookPoint()  # [batch, pos, length]
        self.hook_out = HookPoint()  # [batch, pos, length]

    def forward(
        self, x: Float[torch.Tensor, "batch pos length"]
    ) -> Float[torch.Tensor, "batch pos length"]:
        """
        Apply RMS normalisation over the last dimension of ``x``.

        Inputs that are not float32/float64 (e.g. float16 or bfloat16) are upcast to float32
        while the statistics are computed, so the mean of squares does not lose precision
        or overflow; the normalised tensor is then cast back to the input dtype before the
        weight is applied. float32 and float64 inputs are processed as-is.

        Args:
            x: Tensor to normalise; the reduction runs over its last dimension.

        Returns:
            ``w * x * rsqrt(mean(x^2, dim=-1) + eps)``, passed through ``hook_out``.
        """
        input_dtype = x.dtype
        # Previously an ``assert`` rejected anything but float32; asserts vanish under
        # ``python -O`` and made reduced-precision models unusable, so upcast instead.
        if input_dtype not in (torch.float32, torch.float64):
            x = x.to(torch.float32)

        # Reciprocal RMS, kept with a trailing singleton dimension for broadcasting.
        scale = self.hook_scale(torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps))

        # ``.to`` returns the same tensor when the dtype already matches, so float32 and
        # float64 inputs follow exactly the original computation.
        x = self.hook_normalized(x * scale).to(input_dtype)
        return self.hook_out(self.w * x)
