import torch
def add_gaussian_noise(tensor: torch.Tensor, weight: float = 0.2) -> torch.Tensor:
    """Blend ``tensor`` with Gaussian noise matched to its own statistics.

    The noise is standard-normal noise rescaled by the standard deviation and
    shifted by the mean computed over all elements of ``tensor``. The result
    is the linear mix ``(1 - weight) * tensor + weight * noise``.

    Args:
        tensor: Input tensor; its global mean and standard deviation
            parameterise the noise.
        weight: Mixing coefficient applied to the noise. ``0`` returns the
            input values unchanged; ``1`` returns pure noise.

    Returns:
        A new tensor with the same shape as ``tensor``.
    """
    mean = tensor.mean()
    if tensor.numel() > 1:
        std = tensor.std()
    else:
        # The default (Bessel-corrected) std is NaN with fewer than two
        # elements, which would turn the whole output into NaN. Treat the
        # spread as zero so the noise collapses onto the mean instead.
        std = torch.zeros_like(mean)

    # randn_like is always called so RNG consumption does not depend on the
    # input size.
    noise = torch.randn_like(tensor) * std + mean
    return (1 - weight) * tensor + weight * noise

def pad_and_apply_attention_mask(hiddenstates, attention_mask):
    """Zero out hidden states at positions where the attention mask is 0.

    If ``attention_mask`` covers more sequence positions than
    ``hiddenstates``, the hidden states are first right-padded with zeros
    along dim 1 so both share the same sequence length.

    Args:
        hiddenstates: Tensor laid out as ``[B, S, D]`` (the padding branch
            unpacks exactly three dimensions), or ``None``.
        attention_mask: Tensor laid out as ``[B, S]``; it is cast to float
            and broadcast over the last dimension of the hidden states.

    Returns:
        ``None`` when ``hiddenstates`` is ``None``; otherwise the (possibly
        padded) hidden states multiplied element-wise by the mask.

    Note:
        When ``hiddenstates`` is already longer than the mask no padding is
        added, and the multiplication follows ordinary broadcasting rules
        (so incompatible lengths still raise from the multiply).
    """
    if hiddenstates is None:
        return None

    # Pad by the actual length difference rather than a fixed single step,
    # and only when the mask is the longer of the two.
    missing = attention_mask.shape[1] - hiddenstates.shape[1]
    if missing > 0:
        batch_size, _, hidden_dim = hiddenstates.shape
        # new_zeros inherits dtype and device from hiddenstates, so the
        # concatenation does not rely on dtype promotion.
        padding = hiddenstates.new_zeros(batch_size, missing, hidden_dim)
        padded_hiddenstates = torch.cat([hiddenstates, padding], dim=1)  # [B, S, D]
    else:
        padded_hiddenstates = hiddenstates

    mask = attention_mask.unsqueeze(-1)  # [B, S] -> [B, S, 1]
    return padded_hiddenstates * mask.float()

