# coding:utf-8
import os, time, argparse, random, torch
import numpy as np
import pandas as pd
import utils
from torchvision import transforms
from model import MGPP
from torch_geometric.data import DataLoader
from ogb.graphproppred import PygGraphPropPredDataset
from utils import ASTNodeEncoder, get_vocab_mapping
from utils import augment_edge, encode_y_to_arr, decode_arr_to_seq


def _str2bool(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is ``True``
    because every non-empty string is truthy, so the flag could never be
    switched off with a readable value.

    Args:
        value: The raw command-line string (or an actual ``bool``).

    Returns:
        ``True`` for true/t/yes/y/1 and ``False`` for false/f/no/n/0 or the
        empty string (case-insensitive, surrounding whitespace ignored).

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string stays falsy: it was the only way to obtain False
    # while the options were declared with type=bool.
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got {!r}'.format(value))


parser = argparse.ArgumentParser()
# Dataset
parser.add_argument('--data_path', default='../../../GraphData/', type=str,
                    help="data path (directory)")
parser.add_argument('--dataset', type=str, default="ogbg-code2",
                    help='dataset name (default: ogbg-code2)')
parser.add_argument('--gnn', type=str, default='pem', help='GNN gin, or gcn, or pem (default: pem)')
parser.add_argument('--feature', type=str, default="full",
                    help='full feature or simple feature')

## For neural network
parser.add_argument('--pool', type=str, default="max",
                    help='max/attention/set2set/mem')
parser.add_argument('--use_node_attr', type=_str2bool, default=True,
                    help='True: permutation invariance model, False: permutation sensitive model')
parser.add_argument('--use_edge_attr', type=_str2bool, default=False,
                    help='True: aggregate edge attribute for node, False: no use')
parser.add_argument('--vocab_size', type=int, default=500,
                    help='number of vocab size for permutation sensitive (default: 500)')
parser.add_argument('--max_seq_len', type=int, default=5,
                    help='maximum sequence length to predict (default: 5)')
parser.add_argument('--num_vocab', type=int, default=5000,
                    help='the number of vocabulary used for sequence prediction (default: 5000)')
parser.add_argument('--num_layer', type=int, default=3,
                    help='number of GNN message passing layers (default: 3)')
parser.add_argument('--emb_dim', type=int, default=256, help='hidden size for node feature')

# For training
parser.add_argument('--batch_size', type=int, default=128, help='batch size')
parser.add_argument('--lr', default=0.0001, type=float)
parser.add_argument('--lr_decay_epoch', default=10, type=int)
parser.add_argument('--lr_decay_rate', default=0.95, type=float)
parser.add_argument('--weight_decay', type=float, default=0.0001, help='weight decay')
# parser.add_argument('--dropout', type=float, default=0.3, help='weight decay') # saved property
parser.add_argument('--epochs', type=int, default=25, help='maximum number of epochs')
# NOTE(review): this help text duplicates --epochs; the option name suggests a
# minimum epoch count, but its consumer is not in this file -- confirm and reword.
parser.add_argument('--least_epoch', type=int, default=20, help='maximum number of epochs')
parser.add_argument('--early_stop', type=int, default=15, help='patience for early stopping')

parser.add_argument('--num_workers', type=int, default=0, help='number of workers (default: 0)')
parser.add_argument('--print_freq', default=1, type=int)
parser.add_argument("--run_times", type=int, default=10,
                    help="number of repeated runs; run i is seeded with i (default: 10)")
parser.add_argument('--device', default='cuda', type=str, help='use GPU.')


def _mean_std(values):
    """Return ``(mean, std)`` of *values*, each rounded to 4 decimals (std uses NumPy's default ddof=0)."""
    return round(np.mean(values), 4), round(np.std(values), 4)


def main(pool):
    """Run the full train/validate/test experiment for one pooling method.

    Parses the command line, loads the dataset, builds the target vocabulary
    from the training split, and then, for every hyper-parameter combination
    in the hard-coded grid, trains ``args.run_times`` seeded models. Score
    tables and the best configuration (chosen by mean validation score) are
    written into a freshly created, time-stamped directory below ``log/``.

    Args:
        pool: Pooling method name. It is stored in ``args.pool`` and therefore
            always overrides a ``--pool`` value given on the command line.

    Returns:
        None. Results are printed and written to the log directory.
    """
    args = parser.parse_args()

    args.pool = pool
    # prepare related data
    dataset = PygGraphPropPredDataset(name=args.dataset, root=args.data_path)
    args.num_tasks = dataset.num_tasks

    if args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    # Scale the fraction by 100 so the printed number really is the percentage
    # that the trailing '%' in the message announces.
    print('Target seqence less or equal to {} is {}%.'.format(
        args.max_seq_len,
        100.0 * np.sum(seq_len_list <= args.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()

    ### building vocabulary for sequence prediction. Only use training data.
    vocab2idx, idx2vocab = get_vocab_mapping([dataset.data.y[i] for i in split_idx['train']], args.num_vocab)
    args.num_vocab = len(vocab2idx)
    # add edge attr, then encode the target sequence with the vocabulary
    dataset.transform = transforms.Compose(
        [augment_edge, lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)])

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size, shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size, shuffle=False,
                             num_workers=args.num_workers)

    nodetypes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))
    print(nodeattributes_mapping)

    # prepare related documents
    print('\nSetting environment...')

    log_dir = 'log/' + '_'.join([args.dataset, args.gnn, args.pool, str(args.use_node_attr),
                                 str(args.use_edge_attr), time.strftime("%Y-%m-%d_%H-%M-%S")])
    # makedirs also creates the parent 'log' directory; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(log_dir, exist_ok=True)
    utils.configure_output_dir(log_dir)

    # hyperparameters: lr, weight_decay, emb_dim
    # NOTE: this grid overwrites args.lr / args.weight_decay / args.emb_dim, so
    # the corresponding command-line values are not used.
    lrs = [0.001]
    wds = [0]
    emb_dims = [256]
    combinations = [{'lr': lr, 'wd': wd, 'emb_dim': emb_dim}
                    for lr in lrs for wd in wds for emb_dim in emb_dims]

    results = []
    for combination in combinations:
        args.lr = combination['lr']
        args.weight_decay = combination['wd']
        args.emb_dim = combination['emb_dim']

        train_scores, valid_scores, test_scores, epoch_times = [], [], [], []
        for run in range(args.run_times):
            # set random seed from 0 to run_times - 1
            random.seed(run)
            torch.manual_seed(run)
            np.random.seed(run)

            ### Encoding node features into emb_dim vectors.
            ### The following three node features are used.
            # 1. node type
            # 2. node attribute
            # 3. node depth
            # A fresh encoder is built for every run, after seeding, so that
            # no state is shared between runs and its initialisation follows
            # the run's seed. NOTE(review): presumably MGPP trains the encoder
            # it is given in place -- confirm against model.MGPP.
            node_encoder = ASTNodeEncoder(args.emb_dim, num_nodetypes=len(nodetypes_mapping['type']),
                                          num_nodeattributes=len(nodeattributes_mapping['attr']), max_depth=20)

            # prepare model
            model = MGPP(args, idx2vocab, node_encoder)

            # start training
            train_score, valid_score, epoch_time = model.fit(train_loader, valid_loader, args.epochs)

            # start testing
            test_score = model.predict(test_loader)['F1']

            train_scores.append(train_score)
            valid_scores.append(valid_score)
            test_scores.append(test_score)
            epoch_times.append(epoch_time)

            del model

        # one row per run, columns: train / valid / test score
        trn_val_tst = np.array([train_scores, valid_scores, test_scores]).T
        # NOTE: the file name omits weight_decay; combinations differing only in
        # weight decay would overwrite each other's file.
        np.savetxt(os.path.join(log_dir, str(args.lr) + '_' + str(args.emb_dim) + '.csv'),
                   trn_val_tst, fmt='%.05f')

        train_score_mean, train_score_std = _mean_std(train_scores)
        valid_score_mean, valid_score_std = _mean_std(valid_scores)
        test_score_mean, test_score_std = _mean_std(test_scores)
        epoch_time_mean = round(np.mean(epoch_times), 4)

        results.append(np.array([args.lr, args.weight_decay, args.emb_dim, epoch_time_mean,
                                 train_score_mean, train_score_std,
                                 valid_score_mean, valid_score_std,
                                 test_score_mean, test_score_std]))

    temp_results = np.array(results, dtype=np.float32)
    # Column -4 is the mean validation score: model selection uses validation only.
    best_valid_idx = np.argmax(temp_results[:, -4])
    best_result = temp_results[best_valid_idx, :]
    print(('Mean test Score:{:.4f}, Std test score:{:.4f}').format(best_result[-2], best_result[-1]))

    # record the summary row of every hyper-parameter combination
    np.savetxt(os.path.join(log_dir, 'results.csv'), temp_results, fmt='%.05f')
    # record the best combination as "name:value" lines
    records = ['lr', 'weight_decay', 'emb_dim', 'epoch_time', 'train_score', 'train_std',
               'valid_score', 'valid_std', 'test_score', 'test_std']
    # The context manager closes the file even if a write raises.
    with open(os.path.join(log_dir, "best_result.txt"), 'w') as result_file:
        for name, value in zip(records, best_result):
            result_file.write(name + ':' + np.array2string(value) + '\n')


if __name__ == '__main__':
    # Script entry point: run the complete experiment once per pooling
    # method, in this fixed order.
    for pool_name in ('max', 'attention', 'set2set', 'mem'):
        main(pool_name)