import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.optim.optimizer import Optimizer
import torch.distributed as dist
import time
import random


class GEclOptimizer(Optimizer):
    """Decentralized optimizer that mixes parameters and dual variables with neighbours.

    Each call to :meth:`step` does the following:

    1. replaces the local averaged copy ``tilde_params`` by the mean of the
       local parameters and the most recently received neighbour parameters,
    2. moves the parameters to ``tilde_p - lr * (grad - dual_c)``,
    3. exchanges ``dual_c - grad``, the parameters and the averaged
       parameters with every neighbour through blocking
       ``torch.distributed`` point-to-point calls,
    4. recomputes the dual variable ``dual_c`` from the exchanged values.

    Args:
        params: Iterable of parameters (or param-group dicts) to optimize.
        node_id: Rank of this node; used as the peer id in send/recv calls
            and to decide who sends first on each edge.
        adj_node_ids: Ranks of the neighbouring nodes. May be empty, in
            which case no communication happens and ``alpha_ij`` is 0.
        lr: Step size.
        total_alpha: Total penalty weight; it is split evenly over the
            neighbours into the per-edge weight ``alpha_ij``.
        device: Device on which the dual buffers are allocated and onto
            which received tensors are moved. It should match the device of
            the parameters, since the buffers are combined with them
            arithmetically.

    Note:
        A parameter whose ``grad`` is ``None`` is treated as having a zero
        gradient. It is not skipped, because every node must send and
        receive the same sequence of tensors.
    """

    def __init__(self, params, node_id: int, adj_node_ids: list, lr=1e-5, total_alpha=0., device="cuda"):
        self.lr = lr
        self.node_id = node_id
        self.adj_node_ids = adj_node_ids
        self.device = device

        # An isolated node has no edges to spread the penalty over; use 0
        # instead of dividing by zero.
        n_adj = len(adj_node_ids)
        alpha_ij = total_alpha / n_adj if n_adj else 0.0
        defaults = dict(lr=lr, total_alpha=total_alpha, alpha_ij=alpha_ij)

        super().__init__(params, defaults)

        # Per-parameter buffers live in the param group, index-aligned with
        # group["params"]. The "adj_*" entries are dicts keyed by neighbour id.
        for group in self.param_groups:
            group["c"] = []
            group["tilde_params"] = []  # \tilde{x}: neighbourhood average
            group["dual_c"] = []

            group["adj_c"] = []  # neighbours' (dual_c - grad)
            group["adj_params"] = []
            group["adj_tilde_params"] = []

            for p in group["params"]:
                group["c"].append(torch.zeros_like(p, device=self.device))
                # Neighbour copies start out equal to the local parameter.
                group["tilde_params"].append(p.clone().detach())
                group["dual_c"].append(torch.zeros_like(p, device=self.device))
                group["adj_c"].append(
                    {j: torch.zeros_like(p, device=self.device) for j in adj_node_ids}
                )
                group["adj_params"].append(
                    {j: p.clone().detach() for j in adj_node_ids}
                )
                group["adj_tilde_params"].append(
                    {j: p.clone().detach() for j in adj_node_ids}
                )

        # A_{i|j}: +1 when this node has the smaller id on the edge, else -1.
        self.state["A"] = {
            j: (1.0 if self.node_id < j else -1.0) for j in adj_node_ids
        }

    @staticmethod
    def _grad_or_zeros(p):
        """Return ``p.grad``, or a zero tensor shaped like ``p`` when it is ``None``."""
        return p.grad if p.grad is not None else torch.zeros_like(p)

    @torch.no_grad()
    def initialize(self):
        """Do nothing; no initial parameter exchange is performed."""
        pass

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one optimization step, including neighbour communication.

        Args:
            closure: Optional callable returning the loss. It is evaluated
                AFTER the local parameter update and with gradient tracking
                disabled (this method runs under ``torch.no_grad``), so it
                cannot be used to recompute gradients.

        Returns:
            The value returned by ``closure``, or ``None`` if none was given.
        """
        loss = None

        # Refresh tilde_params from the neighbour parameters received so far.
        self.average_param()

        for group in self.param_groups:
            lr = group['lr']
            for p, tilde_p, dual_c in zip(group['params'], group["tilde_params"], group["dual_c"]):
                grad = self._grad_or_zeros(p)
                p.data = tilde_p.data - lr * (grad - dual_c.data)

        if closure is not None:
            loss = closure()

        self.update(self.adj_node_ids)

        return loss

    @torch.no_grad()
    def update(self, node_ids):
        """Exchange duals and parameters with ``node_ids``, then update ``dual_c``.

        On each edge the node with the smaller id sends first and the other
        receives first, so the blocking calls of the two endpoints pair up.
        """
        for node_id in node_ids:
            if self.node_id < node_id:
                self.send_dual(node_id)
                self.recv_dual(node_id)
                self.send_param(node_id)
                self.recv_param(node_id)
            else:
                self.recv_dual(node_id)
                self.send_dual(node_id)
                self.recv_param(node_id)
                self.send_param(node_id)

        self.update_dual()

    @torch.no_grad()
    def send_param(self, node_id):
        """Send every parameter and its averaged copy to ``node_id``.

        Both tensors of a parameter use the same tag (the parameter index),
        so the receiver tells them apart purely by arrival order; this must
        mirror :meth:`recv_param`.
        """
        for group in self.param_groups:
            for i, (p, tilde_p) in enumerate(zip(group["params"], group["tilde_params"])):
                # Tensors are staged on the CPU before being sent.
                dist.send(tensor=p.data.to("cpu"), dst=node_id, tag=i)
                dist.send(tensor=tilde_p.data.to("cpu"), dst=node_id, tag=i)

    @torch.no_grad()
    def recv_param(self, node_id):
        """Receive parameters and averaged parameters from ``node_id``.

        The received values overwrite ``adj_params[node_id]`` and
        ``adj_tilde_params[node_id]`` after being moved to ``self.device``.
        """
        for group in self.param_groups:
            for i, (adj_p, adj_tilde_p) in enumerate(zip(group["adj_params"], group["adj_tilde_params"])):
                # First message with tag i: the neighbour's parameter.
                recv_p = torch.zeros_like(adj_p[node_id], device="cpu")
                dist.recv(tensor=recv_p, src=node_id, tag=i)
                adj_p[node_id].data = recv_p.data.to(self.device)

                # Second message with tag i: the neighbour's averaged copy.
                recv_tilde = torch.zeros_like(adj_tilde_p[node_id], device="cpu")
                dist.recv(tensor=recv_tilde, src=node_id, tag=i)
                adj_tilde_p[node_id].data = recv_tilde.data.to(self.device)

    @torch.no_grad()
    def average_param(self):
        """Set ``tilde_params`` to the mean of the local and neighbour parameters."""
        n_nodes = len(self.adj_node_ids) + 1
        for group in self.param_groups:
            for p, tilde_p, adj_p in zip(group["params"], group["tilde_params"], group["adj_params"]):
                tilde_p.data = p.clone().detach()
                for node_id in self.adj_node_ids:
                    tilde_p.data += adj_p[node_id]
                tilde_p.data /= n_nodes

    @torch.no_grad()
    def send_dual(self, node_id):
        """Send ``dual_c - grad`` of every parameter to ``node_id``."""
        for group in self.param_groups:
            for i, (p, dual_c) in enumerate(zip(group["params"], group["dual_c"])):
                grad = self._grad_or_zeros(p)
                dist.send(tensor=(dual_c - grad).to("cpu"), dst=node_id, tag=i)

    @torch.no_grad()
    def recv_dual(self, node_id):
        """Receive ``dual_c - grad`` from ``node_id`` and store it in ``adj_c``."""
        for group in self.param_groups:
            for i, adj_c in enumerate(group["adj_c"]):
                recv_c = torch.zeros_like(adj_c[node_id], device="cpu")
                dist.recv(tensor=recv_c, src=node_id, tag=i)
                adj_c[node_id].data = recv_c.to(self.device)

    @torch.no_grad()
    def update_dual(self):
        """Recompute ``dual_c`` from the local and neighbour quantities.

        For every parameter::

            dual_c = mean_over_self_and_neighbours(dual_c - grad)
                     + grad
                     + 0.5 * sum_j alpha_ij * (tilde_x_j - tilde_x_i)
        """
        n_nodes = 1 + len(self.adj_node_ids)
        for group in self.param_groups:
            alpha_ij = group['alpha_ij']

            for p, dual_c, adj_c, tilde_p, tilde_adj_p in zip(
                group["params"],
                group["dual_c"],
                group["adj_c"],
                group["tilde_params"],
                group["adj_tilde_params"],
            ):
                grad = self._grad_or_zeros(p)

                # Sum of (dual_c - grad) over the neighbours, then this node.
                c_sum = torch.zeros_like(dual_c)
                for node_id in self.adj_node_ids:
                    c_sum += adj_c[node_id]
                c_sum += dual_c - grad

                # Weighted disagreement between the averaged parameters.
                penalty = torch.zeros_like(tilde_p)
                for node_id in self.adj_node_ids:
                    penalty += alpha_ij * (tilde_adj_p[node_id] - tilde_p)

                dual_c.data = c_sum / n_nodes + grad + penalty / 2

    @torch.no_grad()
    def param_diff(self):
        """Return the summed norm of ``p - adj_p`` over all parameters and neighbours.

        The result is a CPU tensor, or the float ``0.`` when there is nothing
        to sum over (no neighbours or no parameters).
        """
        diff = 0.
        for group in self.param_groups:
            for p, adj_p in zip(group["params"], group["adj_params"]):
                for node_id in self.adj_node_ids:
                    diff += torch.norm(p - adj_p[node_id]).detach().cpu()
        return diff
