import time
from typing import List, Optional, Tuple

import numpy as np
from torch.utils.data import Dataset
import scipy
import networkx as nx
import gies
from numpy import linalg
import wandb
import torch.nn as nn
import torch
from scipy.optimize import linear_sum_assignment
from torch.optim import Adam
# Module-wide torch device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from .base._base_model import BaseModel


def transitive_closure(matrix, depth):
    """Return the reachability matrix of a directed graph (Warshall's algorithm).

    Args:
        matrix: Square adjacency matrix (nested sequence or array). Any truthy
            entry ``matrix[i][j]`` is treated as an edge ``i -> j``.
        depth: Number of nodes ``0 .. depth - 1`` that are allowed as
            intermediate nodes on a path. Passing the number of nodes yields
            the full transitive closure; larger values are clamped to it.

    Returns:
        An integer array with the same shape as ``matrix`` holding 1 at
        ``[i, j]`` when ``j`` is reachable from ``i`` and 0 otherwise.
    """
    reach = np.array(matrix).astype(bool)
    n = len(reach)

    for k in range(min(depth, n)):
        # Step k never changes row k or column k, so the whole (i, j) sweep
        # can be applied as a single vectorised update.
        reach |= reach[:, k, None] & reach[None, k, :]

    return reach.astype(int)


def _compute_scores(data, masks, d):
    obs_indices = np.where(masks.all(axis=1))[0]
    print(len(obs_indices))
    score_matrix = np.zeros((d, d))
    for node in range(d):
        for var in range(d):
            if node != var: 
                data_obs = data[obs_indices, var]
                int_indices = np.where(1 - masks[:, node])[0]
                data_inter = data[int_indices, var]
                if len(data_inter) > 0:
                    w_dist = scipy.stats.wasserstein_distance(data_obs, data_inter)
                    score_matrix[node, var] = w_dist
    return score_matrix

def createFullyConnectedGraph(topological_order):
    """Build the densest DAG that is compatible with a topological order.

    Args:
        topological_order: Sequence of node indices, earliest node first.

    Returns:
        An ``(n, n)`` float array where entry ``[u, v]`` is 1 whenever ``u``
        appears before ``v`` in ``topological_order`` and 0 otherwise.
    """
    size = len(topological_order)
    adjacency = np.zeros((size, size))

    for position, source in enumerate(topological_order):
        # Every node that comes later in the order becomes a child.
        for target in topological_order[position + 1:]:
            adjacency[source, target] = 1

    return adjacency


def _sort_ranking(score_matrix, lmbda):
    """Greedily assemble a DAG from the highest-scoring matrix entries.

    Entries are visited in decreasing score order. An off-diagonal entry
    ``(i, j)`` whose score exceeds ``lmbda`` is inserted as edge ``i -> j``;
    if the graph stops being acyclic the edge is taken out again.

    Args:
        score_matrix: 2-D array of pairwise scores.
        lmbda: Scores must be strictly greater than this to become an edge.

    Returns:
        ``(G, to_remove)``: the resulting ``networkx.DiGraph`` (one node per
        column of ``score_matrix``) and the set of ``(i, j)`` pairs that were
        rejected because they would have closed a cycle.
    """
    n_rows, n_cols = score_matrix.shape
    graph = nx.DiGraph()
    graph.add_nodes_from(range(n_cols))

    # Flat indices ordered from the largest to the smallest score, mapped
    # back to (row, column) coordinates.
    descending = np.argsort(-score_matrix.flatten())
    sources, targets = np.unravel_index(descending, (n_rows, n_cols))

    rejected = set()
    for i, j in zip(sources, targets):
        if i == j or not score_matrix[i, j] > lmbda:
            continue
        graph.add_edge(i, j)
        if nx.is_directed_acyclic_graph(graph):
            continue
        graph.remove_edge(i, j)
        rejected.add((i, j))

    return graph, rejected

def score_ordering(topological_order, score_matrix, d, eps=0.3):
    """Score a causal order based on the observed distances.

    Walking through the order, each node with at least one non-zero score in
    its row contributes the sum of ``score - eps`` over all nodes that come
    after it in the order.

    Args:
        topological_order: Iterable of node indices in ``range(d)``.
        score_matrix: ``(d, d)`` array of pairwise scores.
        d: Number of nodes.
        eps: Offset subtracted from every counted score.

    Returns:
        The accumulated score of the ordering.
    """
    remaining = list(range(d))
    total = 0
    for node in topological_order:
        # From here on ``remaining`` holds exactly the nodes placed after ``node``.
        remaining.remove(node)
        if not np.any(score_matrix[node, :] != 0.0):
            continue
        total += np.sum(score_matrix[node, remaining] - eps)
    return total

"""MIT License

    Copyright (c) Anonymized

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE"""
def fill_triangular(vec: torch.Tensor, d, upper: bool = False,) -> torch.Tensor:
    """Scatter a vector into the strict triangle of a square matrix.

    Args:
        vec: Tensor of shape ``(..., d(d-1)/2)`` holding the triangle entries
            in the order produced by ``torch.triu_indices`` /
            ``torch.tril_indices``.
        d: Side length of the square output matrices.
        upper: Fill the strictly upper triangle when True, the strictly lower
            triangle otherwise.

    Returns:
        A tensor of shape ``(..., d, d)`` on the device of ``vec`` (allocated
        with torch's default dtype) that is zero outside the chosen triangle.
    """
    index_fn, diagonal = (torch.triu_indices, 1) if upper else (torch.tril_indices, -1)
    rows, cols = index_fn(d, d, offset=diagonal, device=vec.device)

    batch_shape = tuple(vec.shape[:-1])
    filled = torch.zeros(batch_shape + (d, d), device=vec.device)
    filled[..., rows, cols] = vec
    return filled

def compute_perm_matrix(p: torch.Tensor, d: int, sinkhorn_n_iter: int = 300, t: float = 0.5):
    """Turn node potentials into the adjacency matrix of the order they induce.

    The potentials are multiplied with the positions ``1 .. d``, the result is
    Sinkhorn-normalised in log space into a (near) doubly-stochastic matrix,
    and that matrix is rounded to a hard permutation with the Hungarian
    algorithm. A straight-through estimator keeps the hard permutation in the
    forward pass while gradients flow through the soft matrix.

    Args:
        p: Potentials of shape ``(d,)`` (a leading batch dimension is added)
            or ``(batch, d)``.
        d: Number of nodes.
        sinkhorn_n_iter: Maximum number of Sinkhorn iterations.
        t: Temperature the logits are divided by before normalisation.

    Returns:
        ``(adj_matrix, perm)``: ``adj_matrix`` has shape ``(batch, d, d)`` and
        equals ``P @ M @ P^T`` with ``P`` the hard permutation and ``M`` the
        strictly upper-triangular matrix of ones; ``perm`` is the soft
        Sinkhorn output of the same shape.
    """
    def log_sinkhorn_norm(log_alpha: torch.Tensor, tol=1e-3):
        # Alternate row / column normalisation until both marginals are
        # within ``tol`` of one or the iteration budget is used up.
        for _ in range(sinkhorn_n_iter):
            log_alpha = log_alpha - torch.logsumexp(log_alpha, -1, keepdim=True)
            log_alpha = log_alpha - torch.logsumexp(log_alpha, -2, keepdim=True)
            alpha = log_alpha.exp()
            row_error = torch.abs(1. - alpha.sum(-1)).max()
            col_error = torch.abs(1. - alpha.sum(-2)).max()
            if row_error < tol and col_error < tol:
                break
        return log_alpha.exp()

    positions = torch.arange(1, d + 1, dtype=p.dtype, device=p.device).expand(1, -1)
    X = torch.matmul(p.unsqueeze(-1), positions.unsqueeze(-2))

    perm = log_sinkhorn_norm(X / t)

    # Round every soft matrix of the batch to the closest hard permutation.
    perm_matrix = torch.zeros_like(perm)
    for b in range(perm.shape[0]):
        row_ind, col_ind = linear_sum_assignment(-perm[b].detach().cpu().numpy())
        rows = torch.as_tensor(row_ind, dtype=torch.long, device=perm.device)
        cols = torch.as_tensor(col_ind, dtype=torch.long, device=perm.device)
        perm_matrix[b, rows, cols] = 1.0
    # Straight-through: hard values forward, soft gradients backward.
    perm_matrix_hard = (perm_matrix - perm).detach() + perm

    # Strictly upper-triangular ones, created in the dtype of ``perm`` so the
    # products below also work when ``p`` is not float32.
    mask_matrix = torch.triu(
        torch.ones((d, d), dtype=perm.dtype, device=perm.device), diagonal=1
    )
    adj_matrix = torch.matmul(
        torch.matmul(perm_matrix_hard, mask_matrix), perm_matrix_hard.transpose(-1, -2)
    )

    return adj_matrix, perm



class Linear(nn.Module):
    """Bias-free linear structural model with an optional edge mask.

    A single ``d x d`` weight matrix predicts every variable from all
    variables. ``forward`` can multiply the weights element-wise with a mask
    matrix and can zero the predictions of intervened entries.
    """

    def __init__(self, d, lambda1, lambda_int, intervention_mask=None,):
        """
        Args:
            d: Number of variables.
            lambda1: Stored on the module; not used by the module itself.
            lambda_int: Stored on the module; not used by the module itself.
            intervention_mask: Optional tensor indexed as
                ``[sample, variable]`` that the predictions are multiplied
                with in ``forward``.
        """
        super().__init__()
        self.d = d
        self.lambda1 = lambda1
        self.lambda_int = lambda_int
        self.fc = torch.nn.Linear(self.d, self.d, bias=False)
        self.intervention_mask = intervention_mask
        # Last mask matrix that was passed to ``forward`` (None until then).
        self.perm_matrix = None

    def postprocess_A(self, threshold=0.05):
        """Return the transposed weights as a NumPy array, small entries zeroed.

        Args:
            threshold: Entries whose absolute value is not strictly greater
                than this are set to zero.
        """
        A = self.fc.weight.T
        A_est = torch.where(torch.abs(A) > threshold, A, torch.zeros_like(A))
        return A_est.detach().cpu().numpy()

    def l1_reg(self):
        """Return the L1 norm of the weight matrix."""
        return torch.sum(torch.abs(self.fc.weight))

    def forward(self, X, perm_matrix, batch_indices):
        """Predict the selected rows of ``X`` from themselves.

        Args:
            X: 2-D tensor of samples.
            perm_matrix: Mask of shape ``(d, d)`` or ``(1, d, d)`` whose
                transpose is multiplied element-wise with the weights, or
                None to use the unmasked weights.
            batch_indices: Indices of the rows of ``X`` to use.

        Returns:
            The predictions for the selected rows, multiplied with the
            matching rows of ``intervention_mask`` when one was given.
        """
        # The None check has to come first: a missing mask has no ``dim()``.
        if perm_matrix is None:
            A = self.fc.weight
        else:
            if perm_matrix.dim() == 3 and perm_matrix.shape[0] == 1:
                perm_matrix = perm_matrix.squeeze(0)
            A = self.fc.weight * perm_matrix.T
            self.perm_matrix = perm_matrix

        input_batch = X[batch_indices, :]
        output = input_batch.mm(A.T)
        if self.intervention_mask is not None:
            output = output * self.intervention_mask[batch_indices, :]
        return output


def constrain_loss(d, n_iter_sinkhorn, t_sinkhorn, p, score_matrix_torch):
    """Score the order induced by the potentials ``p`` against a score matrix.

    The potentials are squashed into ``(-1, 1)`` and converted by
    ``compute_perm_matrix`` into the adjacency matrix of the order they
    induce. The loss is the negated sum of the scores of all entries that
    this adjacency matrix selects (averaged over the batch dimension).

    Args:
        d: Number of nodes.
        n_iter_sinkhorn: Maximum number of Sinkhorn iterations.
        t_sinkhorn: Sinkhorn temperature.
        p: Potentials, one value per node.
        score_matrix_torch: ``(d, d)`` tensor of pairwise scores.

    Returns:
        ``(loss, perm_matrix)`` where ``perm_matrix`` is the first value
        returned by ``compute_perm_matrix`` (the order-induced adjacency).
    """
    sig_p = (torch.sigmoid(p) * 2) - 1
    perm_matrix, _ = compute_perm_matrix(sig_p, d, sinkhorn_n_iter=n_iter_sinkhorn, t=t_sinkhorn)
    score = perm_matrix * score_matrix_torch
    loss = -torch.sum(torch.mean(score, dim=0))
    return loss, perm_matrix


def _env_weighted_l1(targets, outputs, env_ids, n_envs, lambda_):
    """Return the environment-weighted mean absolute error of one mini-batch.

    Samples with environment id -1 form the baseline environment and are
    weighted with ``1 - lambda_ + lambda_ / n_envs``; every other environment
    present in the batch is weighted with ``lambda_ / n_envs``. The weighted
    per-variable errors are summed over environments and averaged over
    variables.
    """
    weighted_errors = []

    baseline = env_ids == -1
    if targets[baseline].numel() > 0:
        error = torch.mean(torch.abs(targets[baseline] - outputs[baseline]), dim=0)
        weighted_errors.append(error * (1 - lambda_ + (lambda_ / n_envs)))

    for env in torch.unique(env_ids):
        if env == -1:
            continue
        members = env_ids == env
        if targets[members].numel() > 0:
            error = torch.mean(torch.abs(targets[members] - outputs[members]), dim=0)
            weighted_errors.append(error * (lambda_ / n_envs))

    if not weighted_errors:
        # Only reachable for an empty batch; stay connected to the graph.
        return outputs.sum() * 0.0
    return torch.mean(torch.sum(torch.stack(weighted_errors), dim=0))


def causal_discovery_training(X, lambda1, lambda_int, d, score_matrix, interventions, init_ordering=None, scaling=0.1, n_iter=100, lr_int=0.001, n_iter_sinkhorn=300, t_sinkhorn=0.5, eps=0.3, intervention_mask=None):
    """Jointly fit a linear model and node potentials that define a causal order.

    Training runs in two phases: ``n_iter // 2`` warm-up epochs that fit the
    linear model with only self-loops masked out, followed by up to ``n_iter``
    epochs in which the weights are masked by the order induced by the
    potentials ``p`` and (when ``lambda_int > 0``) the order is pulled towards
    the score matrix through ``constrain_loss``.

    Args:
        X: ``(N, d)`` NumPy array of samples.
        lambda1: Weight of the L1 penalty on the model weights.
        lambda_int: Weight of the order constraint; values ``<= 0`` disable
            it and leave the weights unmasked in the second phase.
        d: Number of variables.
        score_matrix: ``(d, d)`` array of pairwise scores. It is copied, the
            caller's array is not modified.
        interventions: Length-``N`` integer array with the environment id of
            every sample; -1 marks the baseline environment.
        init_ordering: Optional node ordering used to initialise the
            potentials so that earlier nodes start with smaller values.
        scaling: Positive and transitively implied entries of the score
            matrix are set to ``scaling * d``.
        n_iter: Number of epochs of the second phase (the warm-up uses half).
        lr_int: Learning rate of the optimiser of the potentials.
        n_iter_sinkhorn: Maximum number of Sinkhorn iterations per epoch.
        t_sinkhorn: Sinkhorn temperature.
        eps: Offset subtracted from all positive scores before thresholding.
        intervention_mask: Optional ``(N, d)`` array that inputs and
            predictions are multiplied with; None disables the masking.

    Returns:
        ``(best_A, best_p)``: the post-processed weight matrix and a snapshot
        of the potentials from the best epoch, both as NumPy arrays.
    """
    p_scale = 0.001
    p = p_scale * torch.randn((d), device=device)
    if init_ordering is not None:
        # The argsort of the ordering is the position of every node in it.
        _, positions = torch.sort(torch.tensor(init_ordering), descending=False)
        # Hand out the potentials in ascending order by position, so nodes
        # that come earlier in the ordering start with smaller potentials.
        p_sorted, _ = torch.sort(p, descending=False)
        p = p_sorted[positions]
    p.requires_grad = True
    p_opt = Adam([p],
                 lr=lr_int,
                 betas=(0.9, 0.99),
                 )

    # Work on a copy so the caller's score matrix is left untouched.
    score_matrix = np.array(score_matrix, dtype=float)
    score_matrix[score_matrix > 0.0] -= eps
    transitive = transitive_closure(score_matrix > 0.0, depth=d)
    score_matrix[transitive > 0.0] = scaling * d
    score_matrix_torch = torch.tensor(score_matrix, dtype=torch.float32).to(device)

    X = torch.tensor(X, dtype=torch.float32).to(device)
    if intervention_mask is not None:
        intervention_mask = torch.tensor(intervention_mask, dtype=torch.float32).to(device)
    env_ids = torch.as_tensor(np.asarray(interventions), dtype=torch.long, device=device)
    envs = len(np.unique(interventions))
    N = X.shape[0]

    model = Linear(d, lambda1=lambda1, lambda_int=lambda_int, intervention_mask=intervention_mask)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    patience = 100
    early_stop = patience
    best_loss = (float('inf'), float('inf'))

    n_batches = 3
    batch_size = max(N // n_batches, 1)
    lambda_ = 0.5

    def masked_targets(batch_indices):
        # Regression targets of a batch, with intervened entries zeroed out
        # in the same way as the model output.
        targets = X[batch_indices]
        if intervention_mask is not None:
            targets = targets * intervention_mask[batch_indices, :]
        return targets

    # Phase 1: warm up the linear model with only self-loops masked out.
    warmup_mask = (torch.ones(d, d) - torch.eye(d)).to(device)
    for i in range(n_iter // 2):
        total_loss = 0
        indices = torch.randperm(N)

        for start in range(0, N, batch_size):
            batch_indices = indices[start:start + batch_size]
            optimizer.zero_grad()
            output = model(X, warmup_mask, batch_indices)

            loss_mse = _env_weighted_l1(
                masked_targets(batch_indices), output, env_ids[batch_indices], envs, lambda_
            )
            loss = (10) * loss_mse + lambda1 * model.l1_reg()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        if i % 10 == 0:
            print("Epoch: {}. Loss = {:.3f}".format(i, total_loss))

    A = model.postprocess_A()
    print("Number of proposed edges is = {}".format(np.count_nonzero(A)))

    # Defined up front so the return value exists even when no epoch of the
    # second phase improves on the initial (infinite) best loss.
    best_A = A
    best_p = p.detach().cpu().numpy().copy()

    # Phase 2: train weights and potentials together.
    for i in range(n_iter):
        total_loss = 0
        total_mse = 0
        indices = torch.randperm(N)
        p_opt.zero_grad()

        if lambda_int <= 0.0:
            perm_matrix = torch.ones((d, d), dtype=p.dtype).to(p.device)
            constraint_loss = torch.tensor(0)
        else:
            constraint_loss, perm_matrix = constrain_loss(d, n_iter_sinkhorn, t_sinkhorn, p, score_matrix_torch)

        for start in range(0, N, batch_size):
            batch_indices = indices[start:start + batch_size]
            optimizer.zero_grad()
            p_opt.zero_grad()
            output = model(X, perm_matrix, batch_indices)

            loss_mse = _env_weighted_l1(
                masked_targets(batch_indices), output, env_ids[batch_indices], envs, lambda_
            )
            loss = (10) * loss_mse + lambda1 * model.l1_reg()
            if lambda_int > 0.0:
                loss = loss + lambda_int * constraint_loss
            # The constraint graph is built once per epoch but back-propagated
            # through for every mini-batch, hence retain_graph.
            loss.backward(retain_graph=True)

            optimizer.step()
            p_opt.step()
            total_mse += loss_mse.item()
            total_loss += loss.item()

        constraint_value = constraint_loss.item()
        if i % 10 == 0:
            print("Epoch: {}. Loss = {:.3f}".format(i, total_loss))
            print("Epoch: {}. MSE Loss = {:.3f}".format(i, total_mse))
            print("Epoch: {}. Cons Loss = {:.3f}".format(i, constraint_value))

        # Best epoch: lowest constraint loss, ties broken by reconstruction error.
        if constraint_value < best_loss[0] or (constraint_value <= best_loss[0] and total_mse < best_loss[1]):
            early_stop = patience
            best_loss = (constraint_value, total_mse)
            best_A = model.postprocess_A()
            # Copy: on CPU the NumPy view shares memory with ``p`` and would
            # otherwise keep changing with every later optimiser step.
            best_p = p.detach().cpu().numpy().copy()
        else:
            early_stop -= 1

        if early_stop <= 0 and i > 500:
            break

    print(model.fc.weight.max(), model.fc.weight.min())
    print(model.fc.weight)
    A = model.postprocess_A()
    print("Number of proposed edges is = {}".format(np.count_nonzero(A)))

    return best_A, best_p

def causal_discovery(
        X: np.ndarray,
        interventions_mask,
        score_matrix,
        eps, 
        config,
        init_ordering = None,
        lambda_int = 100.0,
    ) -> Tuple[List[Tuple], np.ndarray]:
    """Run the training and keep only the edges that agree with the learned order.

    Args:
        X: ``(N, d)`` array of samples.
        interventions_mask: ``(N, d)`` array; a zero in column ``j`` of a row
            marks that sample as an intervention on variable ``j``.
        score_matrix: ``(d, d)`` array of pairwise scores, forwarded to the
            training routine.
        eps: Score offset, forwarded to the training routine.
        config: Mapping from ``d`` to a dict with the keys ``"scaling"`` and
            ``"lr"``.
        init_ordering: Optional initial node ordering.
        lambda_int: Weight of the order constraint.

    Returns:
        ``(edges, adjacency)``: the list of kept ``(parent, child)`` pairs and
        the matching boolean ``(d, d)`` adjacency matrix.
    """
    X = X.astype(np.float32)
    n_samples, d = X.shape[0], X.shape[1]

    # Environment id of every sample: the first intervened variable, or -1
    # for samples without any intervention.
    intervened = (1 - np.asarray(interventions_mask)[:n_samples]) != 0
    interventions = np.where(intervened.any(axis=1), intervened.argmax(axis=1), -1)

    W_est, p_est = causal_discovery_training(X, lambda1=0.01, lambda_int=lambda_int, d=d, score_matrix=score_matrix, interventions=interventions, intervention_mask=interventions_mask, init_ordering=init_ordering, scaling=config[d]["scaling"], n_iter=2000, lr_int=config[d]["lr"], t_sinkhorn = 0.05, n_iter_sinkhorn=500, eps=eps,)

    # Keep a weight only if its parent has the smaller potential, i.e. comes
    # first in the learned order.
    parents, children = np.nonzero(np.abs(W_est) > 0)
    agrees = p_est[parents] < p_est[children]
    parents, children = parents[agrees], children[agrees]

    adjacency = np.zeros((d, d), dtype=bool)
    adjacency[parents, children] = True
    edges = set(zip(parents, children))
    return list(edges), adjacency



class DiffIntersortCons(BaseModel):
    """Causal-discovery model that scores variable pairs from interventional
    data, derives an initial order from those scores and refines it with
    ``causal_discovery``.
    """

    def __init__(self):
        super().__init__()
        # Boolean adjacency matrix produced by ``train`` (None before training).
        self._adj_matrix = None
        # Hyper-parameters keyed by the number of variables ``d``.
        self.config = {
            3: {
                "lr": 0.05,
            },
            10: {
                "lr": 0.005,
                "scaling": 0.1,
            },
            30: {
                "lr": 0.001,
                "scaling": 0.5,
            },
            100: {
                "lr": 0.0001,
                "scaling": 1.0,
            },
            500: {
                "lr": 0.00005,
                "scaling": 10.0,
            },
            1000: {
                "lr": 0.0005,
                "scaling": 1.0,
            },
            2000: {
                "lr": 0.0001,
                "scaling": 1.0,
            }
        }


    def train(
        self,
        dataset: Dataset,
        log_wandb: bool = False,
        wandb_project: str = "diffintersort_cons",
        wandb_config_dict: Optional[dict] = None,
        **model_kwargs,
    ):
        """Estimate the adjacency matrix from a dataset.

        Args:
            dataset: Object whose ``tensors[0]`` holds the samples and
                ``tensors[1]`` the intervention mask (a zero marks an
                intervened entry), both convertible with ``.numpy()``.
            log_wandb: Start a Weights & Biases run before training.
            wandb_project: Project name of that run.
            wandb_config_dict: Config attached to that run.
            **model_kwargs: Accepted but not used.

        Raises:
            KeyError: If ``self.config`` has no entry with both ``"lr"`` and
                ``"scaling"`` for the number of variables in the data.
        """
        data = dataset.tensors[0].numpy()
        # NOTE(review): re-creates the ``bool`` alias on the numpy module that
        # ``gies`` exposes; presumably a compatibility shim - confirm.
        gies.np.bool = bool

        if log_wandb:
            wandb_config_dict = wandb_config_dict or {}
            wandb.init(
                project=wandb_project,
                name="DiffIntersortCons",
                config=wandb_config_dict,
            )

        intervention_mask = dataset.tensors[1].numpy()
        d = data.shape[1]

        # Fail before the expensive scoring when the size is not configured.
        hyper_params = self.config.get(d)
        if hyper_params is None or "scaling" not in hyper_params or "lr" not in hyper_params:
            raise KeyError(
                "DiffIntersortCons has no complete hyper-parameter entry for d={}".format(d)
            )
        # The same score threshold is used for every graph size.
        score_threshold = 0.3

        start = time.time()
        score_matrix = _compute_scores(data, intervention_mask, d)

        pred_sort_ranking, _ = _sort_ranking(score_matrix, score_threshold)
        topological_order_sortranking = list(nx.topological_sort(pred_sort_ranking))
        # Strong scores are saturated, weak positive ones get a small constant.
        score_matrix[score_matrix > score_threshold] = hyper_params["scaling"] * d
        score_matrix[(score_matrix < score_threshold) & (score_matrix > 0.0)] = 0.1

        _, self._adj_matrix = causal_discovery(data, intervention_mask, score_matrix, score_threshold, self.config, init_ordering=topological_order_sortranking)
        self._train_runtime_in_sec = time.time() - start

    def get_adjacency_matrix(self, threshold: bool = True) -> np.ndarray:
        """Return the adjacency matrix from the last ``train`` call.

        ``threshold`` is accepted but has no effect on the result.
        """
        return self._adj_matrix
