import time
from typing import Optional

import numpy as np
from torch.utils.data import Dataset
import scipy
import networkx as nx
import gies
from numpy import linalg
import wandb
import torch
from scipy.optimize import linear_sum_assignment
from torch.optim import Adam
import random

from .base._base_model import BaseModel


def transitive_closure(matrix, depth):
    """Compute reachability with Warshall's algorithm over a limited pivot set.

    Args:
        matrix: Square adjacency structure (nested sequence or array). Any
            truthy entry ``matrix[i][j]`` is treated as an edge ``i -> j``.
        depth: Number of pivot nodes to process. Only nodes ``0 .. depth - 1``
            may appear as intermediate nodes of a path; any value
            ``>= len(matrix)`` yields the full transitive closure.

    Returns:
        An integer ``numpy`` array with 1 where ``j`` is reachable from ``i``
        under the pivot restriction and 0 elsewhere. The input is not modified.
    """
    # ``astype`` returns a fresh array, so the caller's matrix is never touched.
    reach = np.array(matrix).astype(bool)
    n = len(reach)

    # Pivots past the last node do not exist; clamping avoids an IndexError.
    for k in range(min(depth, n)):
        # Row k and column k do not change during step k (reach[i][k] can only
        # be OR-ed with itself), so the whole step is a single outer AND.
        reach |= reach[:, [k]] & reach[[k], :]

    return reach.astype(int)


def _compute_scores(data, masks, d):
    """Score every ordered pair (node, var) by a distribution shift.

    Entry ``[node, var]`` is the Wasserstein distance between the values of
    column ``var`` in the observational samples and its values in the samples
    where ``node`` was intervened on.

    Args:
        data: 2-D array of samples; column ``var`` holds the values of ``var``.
        masks: 2-D array aligned with ``data`` row-wise. A row whose entries
            are all truthy is treated as observational; a row is treated as an
            intervention on ``node`` when ``1 - masks[row, node]`` is non-zero
            (assumes a 0/1 or boolean mask -- TODO confirm with the caller).
        d: Number of variables to score (size of the returned square matrix).

    Returns:
        A ``(d, d)`` float array. The diagonal and the rows of nodes without
        any interventional sample stay at 0.
    """
    # Imported explicitly: a bare ``import scipy`` does not guarantee that the
    # ``scipy.stats`` submodule has been loaded.
    from scipy.stats import wasserstein_distance

    obs_indices = np.where(masks.all(axis=1))[0]
    print(len(obs_indices))
    # The observational rows are the same for every pair; slice them once.
    data_obs = data[obs_indices, :]

    score_matrix = np.zeros((d, d))
    for node in range(d):
        # Samples intervened on ``node`` depend on ``node`` only, not on ``var``.
        int_indices = np.where(1 - masks[:, node])[0]
        if len(int_indices) == 0:
            continue
        for var in range(d):
            if node != var:
                score_matrix[node, var] = wasserstein_distance(
                    data_obs[:, var], data[int_indices, var]
                )
    return score_matrix

def createFullyConnectedGraph(topological_order):
    """Build the densest DAG that is compatible with a node ordering.

    Args:
        topological_order: Sequence of node indices; earlier entries precede
            later ones.

    Returns:
        An ``(n, n)`` float array with ``n = len(topological_order)`` where
        entry ``[a, b]`` is 1 if ``a`` appears before ``b`` in the ordering
        and 0 otherwise.
    """
    size = len(topological_order)
    dense_dag = np.zeros((size, size))

    # Every node points to each node that comes after it in the ordering.
    for position, source in enumerate(topological_order):
        for target in topological_order[position + 1:]:
            dense_dag[source, target] = 1

    return dense_dag


def _sort_ranking(score_matrix, lmbda):
    """Greedily assemble a DAG from pairwise scores, strongest edges first.

    Cells of ``score_matrix`` are visited in order of decreasing score. An
    off-diagonal cell ``(i, j)`` whose score exceeds ``lmbda`` becomes the
    edge ``i -> j`` unless adding it would make the graph cyclic.

    Args:
        score_matrix: 2-D array of pairwise scores.
        lmbda: Threshold a score has to exceed for its edge to be considered.

    Returns:
        A tuple ``(graph, rejected)``: the resulting ``networkx.DiGraph`` (one
        node per column of ``score_matrix``) and the set of ``(i, j)`` pairs
        that were above the threshold but dropped because they closed a cycle.
    """
    n_rows, n_cols = score_matrix.shape
    graph = nx.DiGraph()
    graph.add_nodes_from(range(n_cols))

    # Flat positions sorted by decreasing score, mapped back to (row, col).
    descending = np.argsort(-score_matrix.flatten())
    sources, targets = np.unravel_index(descending, (n_rows, n_cols))

    rejected = set()
    for src, dst in zip(sources, targets):
        if src == dst or not score_matrix[src, dst] > lmbda:
            continue
        graph.add_edge(src, dst)
        if nx.is_directed_acyclic_graph(graph):
            continue
        # The new edge closed a cycle: undo it and remember the pair.
        graph.remove_edge(src, dst)
        rejected.add((src, dst))

    return graph, rejected

def score_ordering(topological_order, score_matrix, d, eps=0.3):
    """Score a causal order based on the observed distances.

    Nodes are visited in the given order. For every node whose row in
    ``score_matrix`` has at least one non-zero entry, the scores towards all
    nodes that come later in the order are summed, each reduced by ``eps``.

    Args:
        topological_order: Iterable of node indices in ``range(d)``.
        score_matrix: 2-D array of pairwise scores, indexed ``[source, target]``.
        d: Number of nodes.
        eps: Constant subtracted from every score that is counted.

    Returns:
        The accumulated score (plain ``0`` when no row contributed).
    """
    successors = list(range(d))
    total = 0
    for node in topological_order:
        # After removal only the nodes placed later in the order remain.
        successors.remove(node)
        if not np.any(score_matrix[node, :] != 0.0):
            continue
        total += np.sum(score_matrix[node, successors] - eps)
    return total

"""MIT License

    Copyright (c) Anonymized

    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all
    copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
    SOFTWARE"""
def fill_triangular(vec: torch.Tensor, d, upper: bool = False,) -> torch.Tensor:
    """
    Scatter a vector into the strict triangle of a square matrix.

    Args:
        vec: A tensor of shape (..., d(d-1)/2)
        d: number of rows and columns of the produced matrices
        upper: whether to fill the upper or lower triangle
    Returns:
        A tensor of shape (..., d, d), where the strictly upper (lower) triangle is filled from vec
        with zeros elsewhere. It is allocated with torch's default dtype on the device of vec.
    """
    index_factory = torch.triu_indices if upper else torch.tril_indices
    diagonal_offset = 1 if upper else -1
    tri_rows, tri_cols = index_factory(d, d, offset=diagonal_offset, device=vec.device)

    filled = torch.zeros(vec.shape[:-1] + (d, d), device=vec.device)
    filled[..., tri_rows, tri_cols] = vec
    return filled

def compute_perm_matrix(p: torch.Tensor, d: int, sinkhorn_n_iter: int = 300, t: float = 0.5):
    """Turn per-node scores into a differentiable ordering of ``d`` nodes.

    The scores are expanded to ``p[i] * (j + 1)``, divided by ``t`` and
    normalised with Sinkhorn iterations in log space. The resulting soft
    matrix is rounded to a hard permutation with the Hungarian algorithm and
    combined with it straight-through: the forward value is the hard
    permutation while gradients flow through the soft matrix.

    Args:
        p: Score tensor whose last dimension is assumed to have length ``d``
            (a 1-D tensor of length ``d`` produces a batch of size 1).
        d: Number of nodes.
        sinkhorn_n_iter: Maximum number of Sinkhorn normalisation rounds.
        t: Temperature dividing the scores before normalisation.

    Returns:
        A tuple ``(adj_matrix, perm)``. ``adj_matrix`` is ``P M P^T`` with
        ``P`` the straight-through permutation and ``M`` the strictly upper
        triangular matrix of ones, i.e. the fully connected DAG of the implied
        ordering. ``perm`` is the soft Sinkhorn matrix. Both have shape
        ``(batch, d, d)``.
    """
    def log_sinkhorn_norm(log_alpha: torch.Tensor, tol=1e-3):
        # Alternate row/column normalisation until both marginals are within tol of 1.
        for i in range(sinkhorn_n_iter):
            log_alpha = log_alpha - torch.logsumexp(log_alpha, -1, keepdim=True)
            log_alpha = log_alpha - torch.logsumexp(log_alpha, -2, keepdim=True)
            exp_log_alpha = log_alpha.exp()
            if torch.abs(1.-exp_log_alpha.sum(-1)).max()<tol and torch.abs(1.-exp_log_alpha.sum(-2)).max()<tol:
                print(i)
                break
        return log_alpha.exp()

    o_scale = 1
    positions = o_scale * torch.arange(1, d + 1, dtype=p.dtype, device=p.device).expand(1, -1)
    # Outer product: entry [.., i, j] equals p[i] * (j + 1).
    scores = torch.matmul(p.unsqueeze(-1), positions.unsqueeze(-2))

    perm = log_sinkhorn_norm(scores / t)

    perm_matrix = torch.zeros_like(perm)
    for i in range(perm.shape[0]):
        # No ``squeeze`` here: perm[i] is already (d, d) and squeezing would
        # collapse the d == 1 case to a 0-d array the solver rejects.
        row_ind, col_ind = linear_sum_assignment(-perm[i].cpu().detach().numpy())
        rows = torch.as_tensor(row_ind, device=perm.device)
        cols = torch.as_tensor(col_ind, device=perm.device)
        perm_matrix[i, rows, cols] = 1.0
    # Straight-through estimator: hard values forward, soft gradients backward.
    perm_matrix_hard = (perm_matrix - perm).detach() + perm

    # Strictly upper-triangular ones, built in the dtype of the permutation so
    # that the products below also work for non-float32 ``p``.
    mask_matrix = torch.triu(
        torch.ones(d, d, dtype=perm_matrix_hard.dtype, device=p.device), diagonal=1
    )
    adj_matrix = torch.matmul(
        torch.matmul(perm_matrix_hard, mask_matrix), perm_matrix_hard.transpose(-1, -2)
    )

    return adj_matrix, perm

def diffintersort(score_matrix, d, init_ordering = None, scaling=0.1, n_iter=100, lr=0.001, n_iter_sinkhorn = 300, t_sinkhorn = 0.5, eps=0.3,):
    """Search for a node ordering that maximises the pairwise scores.

    A score vector ``p`` (one entry per node) is optimised with Adam through
    ``opt_step``; the ordering is read off as ``argsort(p)``.

    Args:
        score_matrix: ``(d, d)`` numpy array of pairwise scores. It is copied,
            the caller's array is left untouched.
        d: Number of nodes.
        init_ordering: Optional initial ordering (sequence of node indices).
            When given, ``p`` is initialised so that ``argsort(p)`` reproduces it.
        scaling: Every pair connected in the transitive closure of the
            positive scores is set to ``scaling * d``.
        n_iter: Maximum number of optimisation steps.
        lr: Adam learning rate.
        n_iter_sinkhorn: Maximum number of Sinkhorn rounds per step.
        t_sinkhorn: Sinkhorn temperature.
        eps: Offset subtracted from every positive score before the closure.

    Returns:
        A tuple ``(ordering, perm)``: the node ordering as an index array and
        the soft permutation matrix returned by ``opt_step``.
    """
    p_scale = 0.001
    device = torch.cuda.current_device() if torch.cuda.is_available() else torch.device("cpu")
    p = p_scale * torch.randn((d), device=device)
    if init_ordering is not None:
        # argsort of the ordering gives, for every node, its position in it.
        _, positions = torch.sort(torch.tensor(init_ordering), descending=False)

        # Sort the random scores in ascending order ...
        p_sorted, _ = torch.sort(p, descending=False)

        # ... and hand the k-th smallest score to the node at position k.
        p = p_sorted[positions]
    p.requires_grad = True
    p_opt = Adam([p],
                lr=lr,
                betas=(0.9,0.99),
                )

    # Work on a private copy so the in-place edits below never leak to the caller.
    score_matrix = np.array(score_matrix, copy=True)
    score_matrix[score_matrix > 0.0] -= eps

    transitive = transitive_closure(score_matrix > 0.0, depth=d)
    score_matrix[transitive > 0.0] = scaling * d

    score_matrix_torch = torch.tensor(score_matrix).to(device)
    initial_ordering = np.argsort(p.detach().cpu().numpy())
    # NOTE(review): this baseline is scored with ``eps`` while ``opt_step``
    # scores its candidates with an offset of 0 -- confirm this is intended.
    best_score = score_ordering(initial_ordering, score_matrix, d, eps)
    best, perm = opt_step(score_matrix, d, n_iter, n_iter_sinkhorn, t_sinkhorn, p, p_opt, score_matrix_torch, best_score)
    return np.argsort(best), perm

def opt_step(score_matrix, d, n_iter, n_iter_sinkhorn, t_sinkhorn, p, p_opt, score_matrix_torch, best_score, early_stop_steps=500):
    """Optimise the score vector ``p`` and keep the best ordering seen.

    Each step builds the adjacency matrix implied by ``p`` with
    ``compute_perm_matrix``, maximises its overlap with
    ``score_matrix_torch`` and then scores ``argsort(p)`` with
    ``score_ordering`` (offset 0).

    Args:
        score_matrix: Numpy score matrix used to evaluate candidate orderings.
        d: Number of nodes.
        n_iter: Maximum number of optimisation steps.
        n_iter_sinkhorn: Maximum number of Sinkhorn rounds per step.
        t_sinkhorn: Sinkhorn temperature.
        p: Score tensor being optimised (updated in place by ``p_opt``).
        p_opt: Optimiser holding ``p``.
        score_matrix_torch: Tensor version of the score matrix used in the loss.
        best_score: Score to beat, i.e. the score of the starting point.
        early_stop_steps: Number of consecutive non-improving steps tolerated
            before stopping; doubled when ``d > 30``.

    Returns:
        A tuple ``(best, perm)``: a numpy copy of the best ``p`` found (the
        starting ``p`` if nothing matched or beat ``best_score``) and the soft
        permutation matrix computed in the last executed step.
    """
    # ``.numpy()`` on a CPU tensor shares memory with ``p``; without the copy
    # the optimiser steps below would silently overwrite this fallback.
    best = p.detach().cpu().numpy().copy()
    batch_size = 1
    # The patience honours ``early_stop_steps`` from the start, not only after
    # the first improvement.
    patience = early_stop_steps * 2 if d > 30 else early_stop_steps
    early_stop_count = patience
    perm = None
    for i in range(n_iter):
        p_opt.zero_grad()
        loss = torch.tensor(0, dtype=p.dtype).to(p.device)

        for _ in range(batch_size):
            # Squash the scores into (-1, 1) before building the permutation.
            sig_p = (torch.sigmoid(p) * 2) - 1
            perm_matrix, perm = compute_perm_matrix(sig_p, d, n_iter_sinkhorn, t_sinkhorn)
            score = perm_matrix * score_matrix_torch
            loss += -torch.sum(torch.mean(score, dim=0))

        loss.backward()
        p_opt.step()

        p_cpu = p.detach().cpu().numpy()
        new_ordering = np.argsort(p_cpu)
        score_new_ordering = score_ordering(new_ordering, score_matrix, d, 0)
        print(f"New score at iter {i}: {score_new_ordering}")
        if score_new_ordering > best_score:
            best = p_cpu.copy()
            best_score = score_new_ordering
            early_stop_count = patience
        else:
            early_stop_count -= 1
            # Ties keep the most recent candidate but still consume patience.
            if score_new_ordering == best_score:
                best = p_cpu.copy()
        if early_stop_count <= 0:
            break

    if perm is None:
        # No step was executed (n_iter <= 0): derive the permutation from the
        # untouched ``p`` instead of failing on an unbound variable.
        with torch.no_grad():
            _, perm = compute_perm_matrix((torch.sigmoid(p) * 2) - 1, d, n_iter_sinkhorn, t_sinkhorn)
    return best, perm.detach().cpu()[0]

    
class DiffIntersort(BaseModel):
    """Causal-order model that fits a node ordering from interventional data.

    ``train`` scores all variable pairs, derives an initial ordering from a
    greedy DAG and refines it with ``diffintersort``; the result is exposed
    as the fully connected DAG of the final ordering.
    """

    # Edge threshold used when no dimension-specific value is listed in ``train``.
    _DEFAULT_LMBDA = 0.3
    # Mirrors the default ``scaling`` of ``diffintersort`` for configs without one.
    _DEFAULT_SCALING = 0.1

    def __init__(self):
        super().__init__()
        self._adj_matrix = None
        # Hyperparameters keyed by the number of variables d.
        self.config = {
            3: {
                "lr": 0.05,
            },
            10: {
                "lr": 0.005,
                "scaling": 0.1,
            },
            30: {
                "lr": 0.001,
                "scaling": 0.5,
            },
            100: {
                "lr": 0.0001,
                "scaling": 1.0,
            },
            500: {
                "lr": 0.00005,
                "scaling": 10.0,
            },
            1000: {
                "lr": 0.0005,
                "scaling": 1.0,
            },
            2000: {
                "lr": 0.0001,
                "scaling": 1.0,
            }
        }


    def train(
        self,
        dataset: Dataset,
        log_wandb: bool = False,
        wandb_project: str = "diffintersort",
        wandb_config_dict: Optional[dict] = None,
        **model_kwargs,
    ):
        """Fit the ordering and store the resulting adjacency matrix.

        Args:
            dataset: Object exposing ``tensors``; ``tensors[0]`` holds the
                samples and ``tensors[1]`` the intervention mask, both
                convertible with ``.numpy()``.
            log_wandb: When true, a wandb run is initialised.
            wandb_project: Project name passed to ``wandb.init``.
            wandb_config_dict: Config passed to ``wandb.init`` (``{}`` if None).
            **model_kwargs: Accepted for interface compatibility; not used here.

        Raises:
            KeyError: If ``self.config`` has no entry for the number of
                variables in the data.
        """
        data = dataset.tensors[0].numpy()
        # Kept from the original implementation: re-creates the ``np.bool``
        # alias on the numpy module referenced by gies (presumably a
        # compatibility shim for newer numpy -- TODO confirm it is still needed).
        gies.np.bool = bool

        if log_wandb:
            wandb_config_dict = wandb_config_dict or {}
            wandb.init(
                project=wandb_project,
                name="DiffIntersort",
                config=wandb_config_dict,
            )

        intervention_mask = dataset.tensors[1].numpy()
        d = data.shape[1]

        # Fail fast, before the pairwise scoring, when d has no hyperparameters.
        if d not in self.config:
            raise KeyError(
                f"No DiffIntersort configuration for d={d}; "
                f"configured sizes: {sorted(self.config)}"
            )
        settings = self.config[d]
        lr = settings["lr"]
        scaling = settings.get("scaling", self._DEFAULT_SCALING)
        lmbda_by_dim = {10: 0.3, 30: 0.3, 100: 0.3}
        lmbda = lmbda_by_dim.get(d, self._DEFAULT_LMBDA)

        start = time.time()
        score_matrix = _compute_scores(data, intervention_mask, d)

        # Initial ordering: topological sort of the greedy thresholded DAG.
        pred_sort_ranking, _ = _sort_ranking(score_matrix, lmbda)
        topological_order_sortranking = list(nx.topological_sort(pred_sort_ranking))

        # Saturate scores above the threshold, flatten the weak positive ones.
        score_matrix[score_matrix > lmbda] = scaling * d
        score_matrix[(score_matrix < lmbda) & (score_matrix > 0.0)] = 0.1

        pred_diffintersort, _ = diffintersort(
            score_matrix.copy(),
            d,
            topological_order_sortranking,
            scaling=scaling,
            n_iter=10000,
            lr=lr,
            t_sinkhorn=0.05,
            n_iter_sinkhorn=500,
            eps=lmbda,
        )
        self._adj_matrix = createFullyConnectedGraph(pred_diffintersort)
        self._train_runtime_in_sec = time.time() - start

    def get_adjacency_matrix(self, threshold: bool = True) -> np.ndarray:
        """Return the adjacency matrix stored by ``train`` (None before training).

        ``threshold`` is accepted for interface compatibility and ignored.
        """
        return self._adj_matrix
