# -*- coding: UTF-8 -*-


import torch
import torch.distributed as dist


class GatherLayer(torch.autograd.Function):
    """All-gather a tensor from every process while keeping it differentiable.

    The forward pass wraps ``torch.distributed.all_gather`` in a custom
    autograd function so that a loss computed on the gathered tensors can
    propagate a gradient back to the local input.

    Use it as ``gathered = GatherLayer.apply(tensor)``. The process group is
    expected to be initialized before the call, because both passes query
    ``torch.distributed`` for the world size / rank.

    Note:
        ``backward`` performs no communication. It returns only the gradient
        that the *local* process computed for its own slot of the gathered
        tuple; gradients that other processes compute for this process's
        tensor are not sent back here.
    """

    # https://discuss.pytorch.org/t/dist-all-gather-and-gradient-preservation-in-multi-gpu-training/120696

    @staticmethod
    def forward(ctx, input):
        """Gather ``input`` from all processes.

        Args:
            ctx: Autograd context object (nothing is stored on it).
            input: Local tensor to share. The receive buffers are allocated
                with this tensor's shape and dtype, so every process is
                expected to pass a tensor of the same shape and dtype.

        Returns:
            A tuple with one tensor per process, ordered by rank.
        """
        # The buffers are fully overwritten by all_gather, so there is no
        # need to pay for zero-initialization.
        gathered = [torch.empty_like(input) for _ in range(dist.get_world_size())]
        dist.all_gather(gathered, input)
        return tuple(gathered)

    @staticmethod
    def backward(ctx, *grads):
        """Return the gradient with respect to the local ``input``.

        Args:
            ctx: Autograd context object (unused).
            *grads: One gradient per tensor returned by ``forward``.

        Returns:
            The gradient belonging to the output slot of the current rank.
        """
        # The output at index `rank` is the copy of this process's own input,
        # so its gradient already has the input's shape and dtype. Returning
        # it directly avoids retaining the input for the backward pass and
        # also works for 0-dim tensors, which cannot be slice-assigned.
        return grads[dist.get_rank()]
