import datetime
import dgl
import ipdb
import errno
import numpy as np
import os
import pickle
import random
import torch

from dgl.data.utils import download, get_download_dir, _get_dgl_url
from pprint import pprint
from scipy import sparse
from scipy import io as sio

def set_random_seed(seed=0):
    """Seed every random number generator used by this module.
    Parameters
    ----------
    seed : int
        Value passed to the Python, NumPy and PyTorch seeding functions
    """
    # Same order as before: Python, then NumPy, then PyTorch (CPU).
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)

def mkdir_p(path, log=True):
    """Create a directory (and any missing parents) for the specified path.
    An already existing directory is not an error, whatever the value of
    ``log``.
    Parameters
    ----------
    path : str
        Path name
    log : bool
        Whether to print result for directory creation
    Raises
    ------
    OSError
        If creation fails for any reason other than the directory
        already existing (e.g. ``path`` exists but is a regular file)
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        # Only "already exists, and it is a directory" is tolerated. The
        # logging flag must not influence whether the error is re-raised.
        if exc.errno != errno.EEXIST or not os.path.isdir(path):
            raise
        if log:
            print('Directory {} already exists.'.format(path))
    else:
        if log:
            print('Created directory {}'.format(path))

def get_date_postfix():
    """Build a timestamp suffix from the current local time.
    Returns
    -------
    post_fix : str
        ``<date>_<HH>-<MM>-<SS>`` where ``<date>`` is ``str(date)`` of now
    """
    now = datetime.datetime.now()
    return f'{now.date()}_{now.hour:02d}-{now.minute:02d}-{now.second:02d}'

def setup_log_dir(args, sampling=False):
    """Name and create the directory used for logging a run.
    The directory is ``<args['log_dir']>/<args['dataset']>_<timestamp>``,
    with ``_sampling`` appended when ``sampling`` is true.
    Parameters
    ----------
    args : dict
        Configuration; the keys ``'log_dir'`` and ``'dataset'`` are read
    sampling : bool
        Whether we are using sampling based training
    Returns
    -------
    log_dir : str
        Path for logging directory
    """
    stamp = get_date_postfix()
    base_dir = args['log_dir']
    run_name = '{}_{}'.format(args['dataset'], stamp)
    suffix = '_sampling' if sampling else ''

    log_dir = os.path.join(base_dir, run_name) + suffix
    mkdir_p(log_dir)
    return log_dir

# The configuration below is from the paper.
# setup() and setup_for_sampling() merge it into ``args`` via ``args.update``,
# so these values replace any same-named keys already present in ``args``.
default_configure = {
    'lr': 0.005,             # Learning rate
    'num_heads': [8],        # Number of attention heads for node-level attention
    'hidden_units': 8,
    'dropout': 0.6,
    'weight_decay': 0.001,
    'num_epochs': 200,
    'patience': 100,
}

# Extra settings merged on top of the defaults by setup_for_sampling() only.
sampling_configure = {
    'batch_size': 20,
}

def setup(args):
    """Finalize the configuration for full-graph training.
    Merges ``default_configure`` into ``args`` (overwriting same-named
    keys), seeds the RNGs with ``args['seed']``, selects the device and
    creates the logging directory.
    Parameters
    ----------
    args : dict
        Configuration; must contain ``'seed'``, ``'cuda'``, ``'log_dir'``
        and ``'dataset'``. Modified in place.
    Returns
    -------
    args : dict
        The same dict, with ``'device'`` set and ``'log_dir'`` replaced by
        the path of the directory created for this run
    """
    args.update(default_configure)
    set_random_seed(args['seed'])

    gpu_id = args['cuda']
    if torch.cuda.is_available():
        args['device'] = f'cuda:{gpu_id}'
    else:
        args['device'] = 'cpu'

    args['log_dir'] = setup_log_dir(args)
    return args

def setup_for_sampling(args):
    """Finalize the configuration for sampling based training.
    Merges ``default_configure`` and then ``sampling_configure`` into
    ``args`` (overwriting same-named keys), seeds the RNGs, selects the
    device and creates a logging directory with a ``_sampling`` suffix.
    Parameters
    ----------
    args : dict
        Configuration; must contain ``'cuda'``, ``'log_dir'`` and
        ``'dataset'``. ``'seed'`` is optional. Modified in place.
    Returns
    -------
    args : dict
        The same dict, with ``'device'`` set and ``'log_dir'`` replaced by
        the path of the directory created for this run
    """
    args.update(default_configure)
    args.update(sampling_configure)
    # Honour a user-supplied seed the same way setup() does; fall back to the
    # previous fixed seed of 0 when none is configured.
    set_random_seed(args.get('seed', 0))
    cuda = args['cuda']
    args['device'] = f'cuda:{cuda}' if torch.cuda.is_available() else 'cpu'
    args['log_dir'] = setup_log_dir(args, sampling=True)
    return args

def get_binary_mask(total_size, indices):
    """Build a 0/1 mask with ones at the given positions.
    Parameters
    ----------
    total_size : int
        Length of the mask
    indices : index-like
        Positions to set to 1 (anything accepted by tensor indexing)
    Returns
    -------
    mask : torch.Tensor
        ``torch.uint8`` tensor of length ``total_size``
    """
    flags = torch.zeros(total_size)
    flags[indices] = 1
    return flags.to(torch.uint8)

def load_allset_hg(remove_self_loop, p2raw, dname, feature_noise = 1):
    """Load a pickled dataset as two DGL graphs built from 'VEV' and 'EVE'.
    The file read is ``<p2raw>/<dname>_noise_<feature_noise>_raw.pickle``,
    where ``feature_noise`` is formatted with ``:g``.
    Parameters
    ----------
    remove_self_loop : bool
        If true, an identity matrix is subtracted from both adjacency
        matrices before the graphs are built. Note that
        ``dgl.add_self_loop`` is applied to both graphs afterwards in
        either case.
    p2raw : str
        Directory containing the pickle file
    dname : str
        Dataset name used in the file name
    feature_noise : number
        Noise level used in the file name
    Returns
    -------
    gs : list
        ``[V_g, E_g]``, the graphs built from ``data['VEV']`` / ``data['EVE']``
    features : torch.Tensor
        Float tensor from ``data['feature']``
    labels : torch.Tensor
        1-D long tensor with one entry per row of ``data['label']``,
        ``-1`` where no class index was written
    num_classes : int
        Number of columns of ``data['label']``
    train_idx, val_idx, test_idx : torch.Tensor
        Long index tensors from the pickle
    train_mask, val_mask, test_mask : torch.Tensor
        uint8 masks over the nodes of ``V_g``
    """
    # data_path = os.path.join(p2raw, f'{dname}_raw.pickle')
    data_path = os.path.join(p2raw, f'{dname}_noise_{feature_noise:g}_raw.pickle')

    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at trusted dataset files.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    labels, features = torch.from_numpy(data['label'].todense()).long(), \
                       torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    # Column index of every nonzero entry of the label matrix.
    tmp_labels = labels.nonzero()[:, 1]
    # Class ids are written to the first len(tmp_labels) slots; the rest stay
    # -1. NOTE(review): this presumes labelled rows come first -- confirm
    # against the dataset preprocessing.
    all_labels = -1 * torch.ones(labels.shape[0], dtype = labels.dtype)
    all_labels[:len(tmp_labels)] = tmp_labels
    labels = all_labels

    if remove_self_loop:
        num_nodes = data['label'].shape[0]
        # A sparse identity keeps the subtraction sparse instead of
        # materialising a dense (num_nodes x num_nodes) array.
        identity = sparse.identity(num_nodes, format='csr')
        data['VEV'] = sparse.csr_matrix(data['VEV'] - identity)
        data['EVE'] = sparse.csr_matrix(data['EVE'] - identity)

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    V_g = dgl.from_scipy(data['VEV'])
    E_g = dgl.from_scipy(data['EVE'])
    V_g = dgl.add_self_loop(V_g)
    E_g = dgl.add_self_loop(E_g)

    gs = [V_g, E_g]

    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = V_g.number_of_nodes()
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return gs, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask

def load_allset_hg_batch(remove_self_loop, p2raw, dname, feature_noise = 1):
    """Load a pickled dataset as one DGL heterograph built from 'VvsE'/'EvsV'.
    The file read is ``<p2raw>/<dname>_noise_<feature_noise>_raw.pickle``,
    where ``feature_noise`` is formatted with ``:g``.
    Parameters
    ----------
    remove_self_loop : bool
        Accepted for signature compatibility with the other loaders. It has
        no effect on the result: the heterograph is built only from
        ``data['VvsE']`` and ``data['EvsV']``.
    p2raw : str
        Directory containing the pickle file
    dname : str
        Dataset name used in the file name
    feature_noise : number
        Noise level used in the file name
    Returns
    -------
    hg : dgl.DGLHeteroGraph
        Graph with node types 'vertex', 'hyperedge', 'appended_vertex' and
        'appended_hyperedge' and edge types 'Vs_E', 'E_Vs', 'Es_V', 'V_Es'
    features : torch.Tensor
        Float tensor from ``data['feature']``
    labels : torch.Tensor
        1-D long tensor with one entry per row of ``data['label']``,
        ``-1`` where no class index was written
    num_classes : int
        Number of columns of ``data['label']``
    train_idx, val_idx, test_idx : torch.Tensor
        Long index tensors from the pickle
    train_mask, val_mask, test_mask : torch.Tensor
        uint8 masks of length ``num_v + num_e``
    """
    data_path = os.path.join(p2raw, f'{dname}_noise_{feature_noise:g}_raw.pickle')

    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at trusted dataset files.
    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    labels, features = torch.from_numpy(data['label'].todense()).long(), \
                       torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    # Column index of every nonzero entry of the label matrix.
    tmp_labels = labels.nonzero()[:, 1]
    # Class ids are written to the first len(tmp_labels) slots; the rest stay
    # -1. NOTE(review): this presumes labelled rows come first -- confirm
    # against the dataset preprocessing.
    all_labels = -1 * torch.ones(labels.shape[0], dtype = labels.dtype)
    all_labels[:len(tmp_labels)] = tmp_labels
    labels = all_labels

    # The former remove_self_loop branch rewrote data['VEV'] / data['EVE']
    # through a dense identity matrix, but neither matrix is read below, so
    # that work has been dropped.

    v_vs_e = data['VvsE']
    e_vs_v = data['EvsV']

    num_v, num_e = v_vs_e.shape
    # All-zero blocks used to pad both incidence matrices to num_v + num_e
    # rows. NOTE(review): assumes e_vs_v has shape (num_e, num_v) -- confirm.
    append2v_vs_e = sparse.csr_matrix((num_e, num_e), dtype = v_vs_e.dtype)
    append2e_vs_v = sparse.csr_matrix((num_v, num_v), dtype = e_vs_v.dtype)

    # v_vs_e rows come first and the zero block follows; for e_vs_v the zero
    # block comes first, so its own rows are offset by num_v.
    v_vs_e = sparse.vstack((v_vs_e, append2v_vs_e))
    e_vs_v = sparse.vstack((append2e_vs_v, e_vs_v))

    hg = dgl.heterograph({
            ('appended_vertex', 'Vs_E', 'hyperedge'): v_vs_e.nonzero(),
            ('hyperedge', 'E_Vs', 'appended_vertex'): v_vs_e.transpose().nonzero(),
            ('appended_hyperedge', 'Es_V', 'vertex'): e_vs_v.nonzero(),
            ('vertex', 'V_Es', 'appended_hyperedge'): e_vs_v.transpose().nonzero()
            },
        num_nodes_dict = {
            'vertex': num_v,
            'hyperedge': num_e,
            'appended_vertex': num_v + num_e,
            'appended_hyperedge': num_v + num_e,
            }
        )

    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    # Row count of the padded v_vs_e, i.e. num_v + num_e.
    num_nodes = v_vs_e.shape[0]
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
           train_mask, val_mask, test_mask

def load_acm(remove_self_loop):
    """Download the ACM3025 pickle and build the two meta-path graphs.
    Parameters
    ----------
    remove_self_loop : bool
        If true, an identity matrix is subtracted from ``data['PAP']`` and
        ``data['PLP']`` before the graphs are built
    Returns
    -------
    gs : list
        ``[author_g, subject_g]`` built from 'PAP' and 'PLP'
    features : torch.Tensor
        Float tensor from ``data['feature']``
    labels : torch.Tensor
        Column index of each nonzero entry of ``data['label']``
    num_classes : int
        Number of columns of ``data['label']``
    train_idx, val_idx, test_idx : torch.Tensor
        Long index tensors from the pickle
    train_mask, val_mask, test_mask : torch.Tensor
        uint8 masks over the nodes of the first graph
    """
    data_path = get_download_dir() + '/ACM3025.pkl'
    download(_get_dgl_url('dataset/ACM3025.pkl'), path=data_path)

    with open(data_path, 'rb') as handle:
        data = pickle.load(handle)

    label_matrix = torch.from_numpy(data['label'].todense()).long()
    features = torch.from_numpy(data['feature'].todense()).float()
    num_classes = label_matrix.shape[1]
    labels = label_matrix.nonzero()[:, 1]

    metapath_keys = ('PAP', 'PLP')
    if remove_self_loop:
        identity = np.eye(data['label'].shape[0])
        for key in metapath_keys:
            data[key] = sparse.csr_matrix(data[key] - identity)

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    gs = [dgl.from_scipy(data[key]) for key in metapath_keys]

    train_idx, val_idx, test_idx = (
        torch.from_numpy(data[key]).long().squeeze(0)
        for key in ('train_idx', 'val_idx', 'test_idx'))

    num_nodes = gs[0].number_of_nodes()
    train_mask, val_mask, test_mask = (
        get_binary_mask(num_nodes, idx)
        for idx in (train_idx, val_idx, test_idx))

    print('dataset loaded')
    split_report = {
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    }
    pprint(split_report)

    return (gs, features, labels, num_classes, train_idx, val_idx, test_idx,
            train_mask, val_mask, test_mask)

def load_acm_raw(remove_self_loop):
    """Download ``ACM.mat`` and build a paper/author/field heterograph.
    Parameters
    ----------
    remove_self_loop : bool
        Must be false; self-loop removal is not supported by this loader
    Returns
    -------
    hg : dgl.DGLHeteroGraph
        Graph with edge types 'pa', 'ap', 'pf' and 'fp'
    features : torch.Tensor
        Float tensor built from the selected rows of 'PvsT'
    labels : torch.Tensor
        Long tensor with one class id (0, 1 or 2) per selected paper
    num_classes : int
        Always 3
    train_idx, val_idx, test_idx : numpy.ndarray
        Index arrays of a random 20% / 10% / 70% split drawn per conference
    train_mask, val_mask, test_mask : torch.Tensor
        uint8 masks over the 'paper' nodes
    Raises
    ------
    AssertionError
        If ``remove_self_loop`` is true
    """
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']       # paper-field?
    p_vs_a = data['PvsA']       # paper-author
    p_vs_t = data['PvsT']       # paper-term, bag of words
    p_vs_c = data['PvsC']       # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    # Keep only the papers that have an entry in one of the chosen conferences.
    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    # (A leftover ipdb.set_trace() breakpoint used to sit here and halted
    # every call; it has been removed.)
    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    features = torch.FloatTensor(p_vs_t.toarray())

    # Row (paper) and column (conference) of every paper-conference entry.
    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    # Give each paper-conference entry a random rank in [0, 1] within its
    # conference, then split on that rank.
    # NOTE(review): the resulting indices are positions in pc_p, used below as
    # paper ids; that presumes one conference entry per paper, in paper order
    # -- confirm.
    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
            train_mask, val_mask, test_mask

def load_data(dataset, remove_self_loop=False, feature_noise = 1,
              p2raw = '/data/shared/dgl_data_raw/'):
    """Dispatch to the loader for the named dataset.
    Parameters
    ----------
    dataset : str
        ``'ACM'``, ``'ACMRaw'`` or one of the pickled dataset names listed
        in ``dname_list`` below
    remove_self_loop : bool
        Forwarded to the selected loader
    feature_noise : number
        Forwarded to ``load_allset_hg``; unused for ``'ACM'`` / ``'ACMRaw'``
    p2raw : str
        Directory holding the pickled datasets; only used for the names in
        ``dname_list``
    Returns
    -------
    tuple
        Whatever the selected loader returns: ``(graph(s), features, labels,
        num_classes, train_idx, val_idx, test_idx, train_mask, val_mask,
        test_mask)``
    Raises
    ------
    NotImplementedError
        If ``dataset`` is not a supported name
    """
    dname_list = ['cora', 'citeseer', 'pubmed',
                  'coauthor_cora', 'coauthor_dblp',
                  'NTU2012', 'ModelNet40',
                  'zoo', 'Mushroom', '20newsW100',
                  'yelp', 'house-committees-100', 'walmart-trips-100']

    if dataset == 'ACM':
        return load_acm(remove_self_loop)
    if dataset == 'ACMRaw':
        return load_acm_raw(remove_self_loop)
    if dataset in dname_list:
        return load_allset_hg(remove_self_loop, p2raw = p2raw,
                feature_noise = feature_noise,
                dname = dataset)

    # Previously the exception object was *returned*, so callers received it
    # as if it were the dataset tuple and failed later on unpacking.
    raise NotImplementedError('Unsupported dataset {}'.format(dataset))

class EarlyStopping(object):
    """Track validation loss/accuracy and signal when training should stop.
    The best model state is checkpointed to a timestamped file under
    ``ES_tmp_files/``.
    Parameters
    ----------
    patience : int
        Number of consecutive steps in which both loss and accuracy are
        worse than the best seen so far before ``early_stop`` is set
    """
    def __init__(self, patience=10):
        dt = datetime.datetime.now()
        # Checkpoint path; the timestamp has one-second resolution.
        self.filename = 'ES_tmp_files/early_stop_{}_{:02d}-{:02d}-{:02d}.pth'.format(
            dt.date(), dt.hour, dt.minute, dt.second)
        self.patience = patience
        self.counter = 0
        self.best_acc = None
        self.best_loss = None
        self.early_stop = False

    def step(self, loss, acc, model):
        """Record one evaluation result and return whether to stop.
        Parameters
        ----------
        loss : float
            Validation loss of the current step
        acc : float
            Validation accuracy of the current step
        model : torch.nn.Module
            Model whose ``state_dict`` is checkpointed on improvement
        Returns
        -------
        early_stop : bool
            True once the patience has been exhausted
        """
        if self.best_loss is None:
            # First call: whatever we have is the best so far.
            self.best_acc = acc
            self.best_loss = loss
            self.save_checkpoint(model)
        elif (loss > self.best_loss) and (acc < self.best_acc):
            # Strictly worse on both metrics: count towards patience.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Checkpoint only when neither metric got worse; the two best
            # values are nevertheless tracked independently.
            if (loss <= self.best_loss) and (acc >= self.best_acc):
                self.save_checkpoint(model)
            self.best_loss = np.min((loss, self.best_loss))
            self.best_acc = np.max((acc, self.best_acc))
            self.counter = 0
        return self.early_stop

    def save_checkpoint(self, model):
        """Save the model's state dict to ``self.filename``."""
        # torch.save does not create missing parent directories, so make sure
        # the checkpoint directory exists first.
        ckpt_dir = os.path.dirname(self.filename)
        if ckpt_dir:
            os.makedirs(ckpt_dir, exist_ok=True)
        torch.save(model.state_dict(), self.filename)

    def load_checkpoint(self, model):
        """Load the latest checkpoint."""
        model.load_state_dict(torch.load(self.filename))