from typing import List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.utils import add_self_loops

from greatx.nn.layers import Sequential, activations
from greatx.functional import spmm
from greatx.utils import wrapper


class MLP(nn.Module):
    """Feed-forward network used internally by GRAND.

    Every hidden stage is built as ``Linear`` -> optional ``BatchNorm1d``
    -> activation -> ``Dropout``; a final ``Linear`` maps the last hidden
    width to ``out_channels``.

    Parameters
    ----------
    in_channels : int
        the input dimensions of the first linear layer
    out_channels : int
        the output dimensions of the last linear layer
    hids : List[int], optional
        the number of hidden units for each hidden layer, by default [16]
    acts : List[str], optional
        the activation name for each hidden layer, resolved through
        ``activations.get``, by default ['relu']
    dropout : float, optional
        the dropout ratio applied after every hidden activation,
        by default 0.5
    bias : bool, optional
        whether the linear layers use a bias term, by default True
    bn : bool, optional
        whether to insert ``BatchNorm1d`` after every hidden linear layer,
        by default False
    """
    @wrapper
    def __init__(self, in_channels: int, out_channels: int,
                 hids: List[int] = [16], acts: List[str] = ['relu'],
                 dropout: float = 0.5, bias: bool = True, bn: bool = False):

        super().__init__()

        layers = []
        width = in_channels  # input width of the layer being built
        # `hids` and `acts` are paired element-wise.
        for hidden_dim, act_name in zip(hids, acts):
            layers.append(nn.Linear(width, hidden_dim, bias=bias))
            if bn:
                layers.append(nn.BatchNorm1d(hidden_dim))
            layers += [activations.get(act_name), nn.Dropout(dropout)]
            width = hidden_dim
        # Output projection; no activation or dropout follows it.
        layers.append(nn.Linear(width, out_channels, bias=bias))
        self.lin = Sequential(*layers)

    def reset_parameters(self):
        """Reset parameters by delegating to the wrapped ``Sequential``."""
        self.lin.reset_parameters()

    def forward(self, x, *args, **kwargs):
        """Apply the layer stack to ``x``.

        Any additional positional or keyword arguments are accepted
        and ignored.
        """
        return self.lin(x)


def sparse_propagate(x: Tensor, edge_index: Tensor, edge_weight: Optional[Tensor],
                     order: int, normalized: bool = True) -> Tensor:
    """Accumulate multi-hop propagated features with sparse products.

    Starting from ``out = x``, the update ``out = spmm(out) + x`` is applied
    ``order`` times. With ``spmm`` acting as multiplication by the (optionally
    normalized) adjacency ``A``, this is the Horner form of
    ``x + A x + ... + A^order x``. The result is the un-averaged sum; the
    caller is responsible for any division by ``order + 1``.

    Parameters
    ----------
    x : Tensor
        node features; ``x.size(0)`` is used as the number of nodes
    edge_index : Tensor
        edge indices of the graph
    edge_weight : Optional[Tensor]
        edge weights, or None
    order : int
        the number of propagation steps; for ``order <= 0`` the loop does
        not run and ``x`` itself is returned
    normalized : bool, optional
        whether to apply GCN normalization (with self-loops) to the graph
        before propagating, by default True

    Returns
    -------
    Tensor
        the accumulated features, same shape as ``x``
    """
    if normalized:
        # gcn_norm inserts the missing self-loops itself, so no separate
        # add_self_loops pass is made beforehand: that would append a second
        # loop to every node that already has one and could replace a
        # caller-supplied self-loop weight with the injected 1.0.
        edge_index, edge_weight = gcn_norm(edge_index, edge_weight,
                                           num_nodes=x.size(0), improved=False,
                                           add_self_loops=True)

    out = x
    for _ in range(order):
        # One hop of propagation plus the residual input features.
        # NOTE(review): relies on greatx's spmm computing the adjacency-times-
        # features product for (features, edge_index, edge_weight).
        out = spmm(out, edge_index, edge_weight) + x

    return out


class GRAND(nn.Module):
    r"""Graph Random Neural Network (GRAND) from the `"Graph Random Neural Network for Semi-Supervised Learning on Graphs"
    <https://arxiv.org/abs/2005.11079>`_ paper (NeurIPS'20)

    This implementation uses efficient sparse matrix operations instead of dense matrices,
    making it scalable to large graphs.

    Parameters
    ----------
    in_channels : int
        the input dimensions of model
    out_channels : int
        the output dimensions of model
    hids : List[int], optional
        the number of hidden units for each hidden layer, by default [64]
    acts : List[str], optional
        the activation function for each hidden layer, by default ['relu']
    dropout : float, optional
        the dropout ratio used inside the MLP, by default 0.5
    dropnode : float, optional
        the dropnode ratio during training, must lie in [0, 1],
        by default 0.5
    order : int, optional
        the number of propagation layers, must be non-negative,
        by default 2
    mlp_input_dropout : float, optional
        the dropout ratio for MLP input, by default 0.5
    bias : bool, optional
        whether to use bias in the layers, by default True

    Raises
    ------
    ValueError
        if ``dropnode`` is outside [0, 1] or ``order`` is negative

    Examples
    --------
    >>> # GRAND with default settings
    >>> model = GRAND(100, 10)

    >>> # GRAND with custom hyperparameters
    >>> model = GRAND(100, 10, hids=[64, 32], dropnode=0.3, order=3)

    Note
    ----
    GRAND requires special training procedure with consistency regularization.
    Use GRANDTrainer for training this model.
    """

    @wrapper
    def __init__(self, in_channels: int, out_channels: int,
                 hids: List[int] = [64], acts: List[str] = ['relu'],
                 dropout: float = 0.5, dropnode: float = 0.5,
                 order: int = 2, mlp_input_dropout: float = 0.5,
                 bias: bool = True):
        super().__init__()

        # Fail early: an out-of-range dropnode yields a degenerate mask or a
        # negative inference scale, and order = -1 would make the averaging
        # in `_rand_prop` divide by zero and silently produce inf/NaN.
        if not 0.0 <= dropnode <= 1.0:
            raise ValueError(
                f"dropnode has to be between 0 and 1, but got {dropnode}")
        if order < 0:
            raise ValueError(
                f"order has to be non-negative, but got {order}")

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.dropnode = dropnode
        self.order = order

        self.mlp = MLP(in_channels, out_channels, hids=hids, acts=acts,
                       dropout=dropout, bias=bias)
        self.mlp_input_dropout = nn.Dropout(mlp_input_dropout)

    def reset_parameters(self):
        """Reset parameters"""
        self.mlp.reset_parameters()

    def forward(self, x: Tensor, edge_index: Tensor, edge_weight: Optional[Tensor] = None) -> Tensor:
        """Forward pass for GRAND

        Parameters
        ----------
        x : Tensor
            node features with shape [num_nodes, num_features]
        edge_index : Tensor
            edge indices with shape [2, num_edges]
        edge_weight : Optional[Tensor]
            edge weights with shape [num_edges]

        Returns
        -------
        Tensor
            output predictions with shape [num_nodes, out_channels]
        """
        # Features are consumed as given; no feature normalization is
        # applied here, so callers that want it must do it beforehand.
        x_prop = self._rand_prop(x, edge_index, edge_weight)

        # Apply MLP with input dropout
        return self.mlp(self.mlp_input_dropout(x_prop))

    def _rand_prop(self, x: Tensor, edge_index: Tensor, edge_weight: Optional[Tensor]) -> Tensor:
        """Random propagation with dropnode

        In training mode whole node rows of ``x`` are zeroed at random, each
        node being kept with probability ``1 - dropnode``. In evaluation mode
        ``x`` is scaled by ``1 - dropnode`` instead. The (masked or scaled)
        features are then propagated with ``sparse_propagate`` and divided
        by ``order + 1``.
        """
        keep_prob = 1 - self.dropnode
        if self.training:
            # torch.rand samples from [0, 1), so a strict `<` keeps a node
            # with probability exactly keep_prob (never when keep_prob is 0,
            # always when it is 1). One draw per node, broadcast over columns.
            mask = (torch.rand(x.shape[0], device=x.device) < keep_prob)[:, None]
            x = x * mask
        else:
            # During inference, scale by the expected value of the mask
            x = x * keep_prob

        # Perform sparse propagation
        out = sparse_propagate(x, edge_index, edge_weight, self.order, normalized=True)

        # Average over propagation steps (including original features)
        return out / (self.order + 1)