import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.optim.optimizer import Optimizer
import torch.distributed as dist
import time

import numpy as np

class GossipOptimizer(Optimizer):
    """Momentum optimizer with client sampling and periodic parameter averaging.

    On every call to :meth:`step` the node applies an exponential-moving-average
    momentum update to its parameters, but only if its ``node_id`` is in the
    currently sampled client set. After every ``local_step`` calls, parameters
    are summed across the process group with ``dist.all_reduce``, divided by
    ``n_nodes``, and a new client set is drawn.

    The sampling RNG is seeded with the fixed value ``0``, so every instance
    constructed with the same ``n_nodes`` / ``n_sampled_nodes`` draws the same
    sequence of client sets.

    Args:
        params: Iterable of parameters or parameter-group dicts, as accepted
            by ``torch.optim.Optimizer``.
        node_id: Identifier of this node; compared against the sampled ids.
        graph: Currently unused by this class; kept for interface
            compatibility.
        local_step: Number of ``step`` calls between two averaging rounds.
        n_nodes: Total number of nodes; ids are drawn from ``[0, n_nodes)``
            and the all-reduced sum is divided by this value.
        n_sampled_nodes: Number of distinct node ids drawn per round.
        lr: Learning rate (per-group default).
        beta: Momentum coefficient (per-group default).
        device: Stored on the instance as ``self.device``. Momentum buffers
            are allocated on each parameter's own device, because the update
            combines them element-wise with the parameter and its gradient.
    """

    def __init__(self, params, node_id: int, graph, local_step: int, n_nodes: int,
                 n_sampled_nodes: int, lr: float = 1e-5, beta: float = 0.9, device="cuda"):
        self.node_id = node_id
        self.device = device

        self.n_nodes = n_nodes
        self.n_sampled_nodes = n_sampled_nodes

        self.local_step = local_step
        self.step_counter = 0
        self.lr = lr

        # Fixed seed: all nodes draw identical client sets without communicating.
        self.rng = np.random.default_rng(0)
        self.client_sampling()

        defaults = dict(lr=lr, beta=beta)
        super().__init__(params, defaults)

        # One momentum buffer per parameter, stored alongside the group and
        # matching the parameter's shape, dtype and device.
        for group in self.param_groups:
            group["momentum"] = [torch.zeros_like(p) for p in group["params"]]

    def client_sampling(self):
        """Draw ``n_sampled_nodes`` distinct node ids into ``self.sampled_clients``."""
        candidates = np.arange(0, self.n_nodes)
        drawn = self.rng.choice(candidates, size=self.n_sampled_nodes, replace=False)
        self.sampled_clients = list(drawn)

    @torch.no_grad()
    def step(self, closure=None):
        """Perform one optimization step.

        Args:
            closure: Optional callable that re-evaluates the model and returns
                the loss. It is called before the update with gradients
                enabled, so it may call ``backward()``.

        Returns:
            The value returned by ``closure``, or ``None`` if no closure was
            given.
        """
        loss = None
        if closure is not None:
            # step() runs under no_grad; the closure needs autograd enabled.
            with torch.enable_grad():
                loss = closure()

        if self.node_id in self.sampled_clients:
            for group in self.param_groups:
                lr = group["lr"]
                beta = group["beta"]

                for p, momentum in zip(group["params"], group["momentum"]):
                    if p.grad is None:
                        # Parameter took no part in the backward pass.
                        continue
                    # momentum <- beta * momentum + (1 - beta) * grad
                    momentum.mul_(beta).add_(p.grad, alpha=1 - beta)
                    # p <- p - lr * momentum
                    p.sub_(momentum, alpha=lr)

        self.step_counter += 1
        if self.step_counter % self.local_step == 0:
            # Every node averages, whether or not it was sampled this round.
            self.average()
            self.client_sampling()

        return loss

    @torch.no_grad()
    def average(self):
        """Replace each parameter by its all-reduced sum divided by ``n_nodes``.

        Calls ``dist.all_reduce`` once per parameter on the default process
        group.
        """
        # NOTE(review): dividing by n_nodes presumes the process group's world
        # size equals n_nodes -- confirm against the launcher.
        for group in self.param_groups:
            for p in group["params"]:
                dist.all_reduce(p.data, op=dist.ReduceOp.SUM)
                p /= self.n_nodes

