import torch


def get_deltas_gpu(A_tensor, B_tensor, similarity_type):
    """Compute deltas between the rows of two tensors using torch ops.

    No device transfer or numpy conversion is performed here; the result is
    a torch tensor built directly from the inputs.

    Args:
        A_tensor: Tensor of row vectors.
            NOTE(review): presumably shaped (N, D) -- the cosine branch
            normalizes and reduces over dim 1; confirm against callers.
        B_tensor: Either one row per row of ``A_tensor``, or a single row
            (leading dimension of size 1) that is compared against every
            row of ``A_tensor``.
        similarity_type: ``'subtraction'`` or ``'cosine'``.

    Returns:
        For ``'subtraction'``: ``B_tensor - A_tensor``, with a single-row
        ``B_tensor`` broadcast across ``A_tensor``.
        For ``'cosine'``: the cosine similarity of each row pair, returned
        with a trailing dimension of size 1 (a column for 2-D inputs).

    Raises:
        ValueError: If ``similarity_type`` is not one of the values above.
    """
    if similarity_type == 'subtraction':
        # Broadcasting stretches a single-row B across A without
        # materializing a repeated copy, and (unlike ``repeat(n, 1)``) it
        # also works when the tensors have more than two dimensions.
        return B_tensor - A_tensor

    if similarity_type == 'cosine':
        # L2-normalize along dim 1 so that dot products are cosine values.
        A_norm = torch.nn.functional.normalize(A_tensor, p=2, dim=1)
        B_norm = torch.nn.functional.normalize(B_tensor, p=2, dim=1)
        if B_norm.shape[0] == 1:
            # One reference row: a single matmul yields its similarity to
            # every row of A; reshape the result into a column.
            return torch.matmul(A_norm, B_norm.T).view(-1, 1)
        # Paired rows: row-wise dot product, keeping the reduced dimension.
        return torch.sum(A_norm * B_norm, dim=1, keepdim=True)

    raise ValueError("Unknown similarity type")