# LayerNorm Kernel with Per-Example Gradient Norms

To use the kernel, wrap `PEGLayerNorm` in an `nn.Module` as follows:

```python
from normgnorm import PEGLayerNorm

class LayerNorm(nn.Module):
  """LayerNorm module backed by the ``PEGLayerNorm`` kernel.

  Besides the usual ``weight`` and ``bias`` parameters, the module owns two
  2-element accumulator parameters, ``weight_pegsqnorm`` and
  ``bias_pegsqnorm``, which are handed to the kernel on every forward call.

  After a backward pass the kernel reports its statistics through the
  ``.grad`` of those accumulators:

  * ``weight_pegsqnorm.grad[0]`` / ``bias_pegsqnorm.grad[0]``: the
    per-example gradient squared norms for ``weight`` / ``bias``.
  * ``weight_pegsqnorm.grad[1]`` / ``bias_pegsqnorm.grad[1]``: the number
    of training examples seen.

  Args:
    normalized_shape: Shape passed to ``init_weight`` / ``init_bias`` and
      forwarded to the kernel.
    eps: Epsilon value forwarded to the kernel. Defaults to ``1e-5``.
    device: Device on which the two accumulator parameters are allocated.
      Defaults to ``"cuda"``.
  """

  def __init__(self, normalized_shape, eps=1e-5, device="cuda"):
    super().__init__()
    self.weight = nn.Parameter(init_weight(normalized_shape))
    self.bias = nn.Parameter(init_bias(normalized_shape))

    # Accumulators are registered as parameters so that the kernel can
    # deliver its statistics through their .grad after the backward pass:
    #   .grad[0] -> per-example gradient squared norms
    #   .grad[1] -> number of training examples seen
    self.weight_pegsqnorm = nn.Parameter(torch.zeros(2, device=device))
    self.bias_pegsqnorm = nn.Parameter(torch.zeros(2, device=device))
    self.eps = eps
    self.normalized_shape = normalized_shape

  def forward(self, x):
    """Apply ``PEGLayerNorm`` to ``x`` and return the kernel's output."""
    return PEGLayerNorm.apply(x, self.weight, self.bias,
                              self.weight_pegsqnorm,
                              self.bias_pegsqnorm,
                              self.normalized_shape,
                              self.eps)
```
