import dgl
import dgl.nn as dglnn
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import os
from utils.gcl_aug import aug
from utils.gcl_eval import label_classification

def load_dataset(name, run, device,
                 data_root='/home/yilun/HOM_GNN/syn-heterophilous-graphs'):
    """Load one ``.npz`` graph dataset and move it to ``device``.

    Parameters
    ----------
    name : str
        Dataset name; hyphens are replaced by underscores to build the file
        name ``<data_root>/data/<name>.npz``.
    run : int
        Index used to pick one split out of the stored train/val/test masks.
        Presumably the masks are stacked as (num_splits, num_nodes) -- confirm
        against the data files.
    device : torch.device or str
        Device the graph, features, labels and masks are moved to.
    data_root : str, optional
        Directory containing the ``data`` folder. Defaults to the path that
        was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    tuple
        ``(graph, features, labels, masks, in_size, out_size)`` where
        ``masks`` is ``(train_mask, val_mask, test_mask)`` for split ``run``,
        ``in_size`` is the feature dimension and ``out_size`` is the number of
        distinct label values.
    """
    path = os.path.join(data_root, 'data', f'{name.replace("-", "_")}.npz')

    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; the context manager closes it once everything has been copied
    # into tensors (torch.tensor always copies).
    with np.load(path) as data:
        features = torch.tensor(data['node_features'])
        labels = torch.tensor(data['node_labels'])
        edges = torch.tensor(data['edges'])
        train_mask = torch.tensor(data['train_masks'])
        val_mask = torch.tensor(data['val_masks'])
        test_mask = torch.tensor(data['test_masks'])

    # Accept a (2, E) edge list by transposing it to (E, 2).
    # NOTE(review): a (2, 2) array is ambiguous and is always transposed.
    if edges.shape[0] == 2:
        edges = edges.t()

    graph = dgl.graph((edges[:, 0], edges[:, 1]), num_nodes=len(features), idtype=torch.int)
    graph = dgl.to_bidirected(graph)
    graph = dgl.add_self_loop(graph)

    in_size = features.shape[1]
    out_size = len(labels.unique())

    graph = graph.to(device)
    features = features.to(device)
    labels = labels.to(device)
    masks = train_mask[run].to(device), val_mask[run].to(device), test_mask[run].to(device)

    return graph, features, labels, masks, in_size, out_size

def get_gcn_feature(name, run, device, hidden_dim=128, num_steps=1000):
    """Train a supervised GCN on one split and return its node outputs.

    Parameters
    ----------
    name : str
        Dataset name forwarded to ``load_dataset``.
    run : int
        Split index forwarded to ``load_dataset``; also shown in the progress bar.
    device : torch.device or str
        Device used for training and inference.
    hidden_dim : int
        Hidden size of the two-layer GCN.
    num_steps : int
        Number of full-batch training epochs.

    Returns
    -------
    torch.Tensor
        Detached CPU tensor holding the model's final-layer outputs for every
        node (one column per class, since the last layer has ``out_size`` units).
    """
    graph, features, labels, masks, in_size, out_size = load_dataset(name, run, device)
    model = GCN(in_size, hidden_dim, out_size)
    model = model.to(device)
    print("Pretraining GCN Features For Graph Reconstruction...")
    train(graph, features, labels, masks, model, run, num_steps)
    acc = evaluate(graph, features, labels, masks[2], model)
    print("Pretraining GCN Test accuracy {:.4f}".format(acc))

    # Make inference mode explicit instead of relying on evaluate() having
    # left the model in eval mode, and skip building an autograd graph that
    # would be thrown away immediately.
    model.eval()
    with torch.no_grad():
        embeddings = model(graph, features)
    embeddings = embeddings.detach().cpu()
    return embeddings

def get_mlp_feature(name, run, device, hidden_dim=128, num_steps=1000):
    """Train a supervised MLP on one split and return its node outputs.

    Parameters
    ----------
    name : str
        Dataset name forwarded to ``load_dataset``.
    run : int
        Split index forwarded to ``load_dataset``; also shown in the progress bar.
    device : torch.device or str
        Device used for training and inference.
    hidden_dim : int
        Hidden size of the MLP.
    num_steps : int
        Number of full-batch training epochs.

    Returns
    -------
    torch.Tensor
        Detached CPU tensor holding the model's final-layer outputs for every
        node (one column per class, since the last layer has ``out_size`` units).
    """
    graph, features, labels, masks, in_size, out_size = load_dataset(name, run, device)
    model = MLP(in_size, hidden_dim, out_size)
    model = model.to(device)
    print("Pretraining MLP Features For Graph Reconstruction...")
    train(graph, features, labels, masks, model, run, num_steps)
    acc = evaluate(graph, features, labels, masks[2], model)
    # The message previously said "GCN" (copy-paste from get_gcn_feature).
    print("Pretraining MLP Test accuracy {:.4f}".format(acc))

    # Make inference mode explicit and avoid tracking gradients for a tensor
    # that is detached right away.
    model.eval()
    with torch.no_grad():
        embeddings = model(graph, features)
    embeddings = embeddings.detach().cpu()
    return embeddings

# GRACE does not reuse train(): contrastive pretraining needs two augmented views per step and its own loss.
def get_grace_feature(name, run, device, hidden_dim=128, num_steps=1000):
    """Pretrain a GRACE model contrastively and return encoder embeddings.

    Each step draws two augmented views of the graph (feature dropping and
    edge masking, both at rate 0.2) and minimises the contrastive loss
    returned by ``Grace.forward``. After training, the encoder embeddings are
    passed to ``label_classification`` and returned.

    Parameters
    ----------
    name : str
        Dataset name forwarded to ``load_dataset``.
    run : int
        Split index forwarded to ``load_dataset``; also shown in the progress bar.
    device : torch.device or str
        Device used for training and inference.
    hidden_dim : int
        Hidden size passed to ``Grace``.
    num_steps : int
        Number of contrastive training epochs.

    Returns
    -------
    torch.Tensor
        Detached CPU tensor of encoder embeddings, one row per node.
    """
    graph, feat, labels, masks, in_size, out_size = load_dataset(name, run, device)

    model = Grace(in_size, hidden_dim, out_size, num_layers=2, act_fn=nn.ReLU(), temp=1.0)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    print("Pretraining GRACE Features For Graph Reconstruction...")
    with tqdm(total=num_steps, desc=f'Run {run}') as progress_bar:
        for epoch in range(num_steps):
            model.train()
            optimizer.zero_grad()
            # Two independently augmented views of the same graph.
            graph1, feat1 = aug(graph, feat, feat_drop_rate=0.2, edge_mask_rate=0.2)
            graph2, feat2 = aug(graph, feat, feat_drop_rate=0.2, edge_mask_rate=0.2)

            graph1 = graph1.to(device)
            graph2 = graph2.to(device)

            feat1 = feat1.to(device)
            feat2 = feat2.to(device)

            loss = model(graph1, graph2, feat1, feat2)
            loss.backward()
            optimizer.step()

            progress_bar.update()
            progress_bar.set_postfix({"Epoch":"{:05d}".format(epoch),
                                      "Loss:":"{:.4f}".format(loss.item())})

    # The label previously said "GCN" (copy-paste from the GCN path).
    print("Pretraining GRACE Test accuracy:")

    # NOTE(review): load_dataset already adds self-loops and DGL's
    # add_self_loop does not de-duplicate, so every node ends up with two
    # self-loops here -- confirm this is intended.
    graph = graph.add_self_loop()
    graph = graph.to(device)
    feat = feat.to(device)

    # The loop above leaves the model in train mode, which would keep the
    # encoder's dropout active and randomly corrupt the embeddings. Switch to
    # eval mode and disable autograd for the inference pass.
    model.eval()
    with torch.no_grad():
        embeddings = model.get_embedding(graph, feat)

    # Evaluate the embeddings with the downstream classification probe.
    label_classification(
        embeddings, labels, train_mask=masks[0], test_mask=masks[2], split="random"
    )
    embeddings = embeddings.detach().cpu()
    return embeddings
    

def evaluate(g, features, labels, mask, model):
    """Return the classification accuracy of ``model`` on the masked nodes.

    The model is switched to eval mode (and left there) and run without
    gradient tracking. The predicted class of each node is the arg-max over
    the second dimension of the model output.

    Parameters
    ----------
    g : graph object passed straight through to ``model``.
    features : tensor of node inputs passed to ``model``.
    labels : tensor of ground-truth class indices.
    mask : index or boolean tensor selecting the nodes to score.
    model : callable ``model(g, features)`` exposing ``eval()``.

    Returns
    -------
    float
        Fraction of selected nodes whose prediction equals the label.
    """
    model.eval()
    with torch.no_grad():
        scores = model(g, features)[mask]
        targets = labels[mask]
        predicted = scores.argmax(dim=1)
        num_hits = (predicted == targets).sum().item()
        return float(num_hits) / len(targets)


def train(g, features, labels, masks, model, run, num_steps):
    """Fit ``model`` with full-batch Adam updates on the training split.

    Every epoch performs one forward/backward pass using cross-entropy on the
    nodes selected by ``masks[0]``, then reports validation accuracy on
    ``masks[1]`` through the progress bar. The model is modified in place;
    nothing is returned.

    Parameters
    ----------
    g : graph object passed to ``model``.
    features : tensor of node inputs.
    labels : tensor of class indices.
    masks : sequence whose first two entries select train and validation nodes.
    model : nn.Module called as ``model(g, features)``.
    run : value shown in the progress-bar description.
    num_steps : int
        Number of epochs.
    """
    fit_nodes, holdout_nodes = masks[0], masks[1]
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2, weight_decay=5e-4)

    with tqdm(total=num_steps, desc=f'Run {run}') as progress_bar:
        for epoch in range(num_steps):
            # evaluate() below flips the model to eval mode, so re-enable
            # training mode at the start of every epoch.
            model.train()
            outputs = model(g, features)
            loss = criterion(outputs[fit_nodes], labels[fit_nodes])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            holdout_acc = evaluate(g, features, labels, holdout_nodes, model)

            progress_bar.update()
            status = {
                "Epoch": f"{epoch:05d}",
                "Loss:": f"{loss.item():.4f}",
                "Accuracy": f"{holdout_acc:.4f}",
            }
            progress_bar.set_postfix(status)


''' The models below are from the DGL examples. '''

class GCN(nn.Module):
    """Two-layer graph convolutional network.

    The first ``GraphConv`` maps ``in_size -> hid_size`` with a ReLU
    activation; the second maps ``hid_size -> out_size`` with no activation.
    Dropout (p=0.5) is applied to the input of every layer except the first.
    """

    def __init__(self, in_size, hid_size, out_size):
        super().__init__()
        first_conv = dglnn.GraphConv(in_size, hid_size, activation=F.relu)
        second_conv = dglnn.GraphConv(hid_size, out_size)
        self.layers = nn.ModuleList([first_conv, second_conv])
        self.dropout = nn.Dropout(0.5)

    def forward(self, g, features):
        """Run the layers in order on graph ``g`` and return the last output."""
        h = features
        for idx, conv in enumerate(self.layers):
            # Raw input features are not dropped; only hidden activations are.
            layer_input = h if idx == 0 else self.dropout(h)
            h = conv(g, layer_input)
        return h
    

class MLP(nn.Module):
    """Two-layer perceptron: ``Linear -> ELU -> Linear``.

    ``forward`` takes a graph argument only so that it can be called with the
    same ``model(g, x)`` signature as the graph models; the graph is ignored.
    """

    def __init__(self, in_dim, hid_size, out_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hid_size)
        self.fc2 = nn.Linear(hid_size, out_dim)

    def forward(self, g, x):
        """Return ``fc2(elu(fc1(x)))``; ``g`` is unused."""
        hidden = self.fc1(x)
        hidden = F.elu(hidden)
        out = self.fc2(hidden)
        return out
    

class Grace(nn.Module):
    r"""
        GRACE model: a GCN encoder plus an MLP projection head trained with a
        contrastive loss between two augmented views.

    Parameters
    -----------
    in_dim: int
        Input feature size.
    hid_dim: int
        Hidden feature size. The encoder outputs embeddings of size
        ``hid_dim // 2``.
    out_dim: int
        Output size of the projection head (used only for the loss).
    num_layers: int
        Accepted for interface compatibility but currently unused: the
        encoder is always the fixed two-layer ``GCN``.
    act_fn: nn.Module
        Accepted for interface compatibility but currently unused: the
        encoder uses the activation built into ``GCN``.
    temp: float
        Temperature constant dividing the similarities in the loss.
    """

    def __init__(self, in_dim, hid_dim, out_dim, num_layers, act_fn, temp):
        super(Grace, self).__init__()
        self.encoder = GCN(in_dim, hid_dim, hid_dim//2)
        self.temp = temp
        self.proj = MLP(hid_dim//2, hid_dim, out_dim)

    def sim(self, z1, z2):
        """Return the matrix of cosine similarities between rows of z1 and z2."""
        # normalize embeddings across feature dimension
        z1 = F.normalize(z1)
        z2 = F.normalize(z2)

        s = torch.mm(z1, z2.t())
        return s

    def get_loss(self, z1, z2):
        """Return the per-node contrastive loss of view ``z1`` against ``z2``.

        For node i the positive pair is (z1[i], z2[i]); the denominator sums
        all inter-view pairs and all intra-view pairs except (z1[i], z1[i]).
        """
        # calculate SimCLR loss
        f = lambda x: torch.exp(x / self.temp)

        refl_sim = f(self.sim(z1, z1))  # intra-view pairs
        between_sim = f(self.sim(z1, z2))  # inter-view pairs

        # between_sim.diag(): positive pairs
        x1 = refl_sim.sum(1) + between_sim.sum(1) - refl_sim.diag()
        loss = -torch.log(between_sim.diag() / x1)

        return loss

    def get_embedding(self, graph, feat):
        """Return detached encoder embeddings for evaluation.

        The encoder is run in eval mode (so its dropout is disabled) and
        without gradient tracking; the module's previous train/eval mode is
        restored before returning.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                h = self.encoder(graph, feat)
        finally:
            # Restore the caller's mode even if the encoder raises.
            self.train(was_training)

        return h.detach()

    def forward(self, graph1, graph2, feat1, feat2):
        """Return the mean symmetric contrastive loss between the two views."""
        # encoding
        h1 = self.encoder(graph1, feat1)
        h2 = self.encoder(graph2, feat2)

        # projection (the MLP ignores its graph argument)
        z1 = self.proj(None, h1)
        z2 = self.proj(None, h2)

        # get loss
        l1 = self.get_loss(z1, z2)
        l2 = self.get_loss(z2, z1)

        ret = (l1 + l2) * 0.5

        return ret.mean()