


from __future__ import print_function
import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR, StepLR
from data import S3DIS
from model import DGCNN_semseg
from model_accelerate import DGCNN_semseg_full
import numpy as np
from torch.utils.data import DataLoader
from util import cal_loss, IOStream
import sklearn.metrics as metrics
from flops_counter import get_model_flops
from IPython import embed


def _init_(save_path):
    """Create the experiment's checkpoint folders and back up the sources.

    Creates ``<save_path>checkpoints/<exp_name>/models`` and copies the main
    training sources next to the checkpoints with a ``.backup`` suffix.

    ``save_path`` is used as a plain string prefix (it is concatenated, not
    joined), so it should be empty or end with a path separator. The
    experiment name is read from the module-level ``args`` namespace
    (``args.exp_name``), which must exist before this function is called.

    Args:
        save_path: String prefix under which ``checkpoints/`` is created.
    """
    # Local import keeps this block self-contained.
    import shutil

    exp_dir = save_path + 'checkpoints' + '/' + args.exp_name

    # makedirs builds every missing intermediate directory in one call, and
    # exist_ok avoids the check-then-create race of separate exists() tests.
    os.makedirs(exp_dir + '/' + 'models', exist_ok=True)

    # Copy without a shell so paths containing spaces or shell metacharacters
    # work. The backup stays best-effort: a missing or unreadable source is
    # reported but does not abort the run.
    for source in ('main_semseg.py', 'model.py', 'util.py', 'data.py'):
        try:
            shutil.copy(source, exp_dir + '/' + source + '.backup')
        except (IOError, OSError) as err:
            print('Warning: could not back up %s: %s' % (source, err))


def calculate_sem_IoU(pred_np, seg_np, num_classes=13):
    """Compute the per-class intersection-over-union of two label arrays.

    Intersections and unions are accumulated over every element of the
    inputs, so the result is a dataset-level IoU per class rather than an
    average of per-sample IoUs.

    Args:
        pred_np: Array of predicted integer class labels.
        seg_np: Array of ground-truth integer class labels, same shape as
            ``pred_np``.
        num_classes: Number of classes to score; labels are expected in
            ``range(num_classes)``. Defaults to 13, the value previously
            hard-coded here.

    Returns:
        A float array of length ``num_classes``. Entries for classes that
        occur in neither input are ``nan`` (their IoU is undefined).
    """
    pred_np = np.asarray(pred_np)
    seg_np = np.asarray(seg_np)

    intersections = np.zeros(num_classes)
    unions = np.zeros(num_classes)
    for sem in range(num_classes):
        pred_mask = pred_np == sem
        true_mask = seg_np == sem
        # Summing over the whole array equals accumulating sample by sample.
        intersections[sem] = np.sum(np.logical_and(pred_mask, true_mask))
        unions[sem] = np.sum(np.logical_or(pred_mask, true_mask))

    # Divide only where the union is non-empty: absent classes stay nan
    # explicitly instead of triggering a 0/0 RuntimeWarning.
    ious = np.full(num_classes, np.nan)
    present = unions > 0
    ious[present] = intersections[present] / unions[present]
    return ious


def train(args, io):
    """Train a segmentation model and evaluate it after every epoch.

    For each epoch the model is optimised on the 'train' partition, then
    evaluated on the 'test' partition. Loss, accuracy, balanced accuracy and
    mean IoU are logged through ``io.cprint``, together with the average
    forward time and the peak GPU memory of the evaluation pass. Whenever the
    test mean IoU matches or beats the best value so far, the model weights
    are saved to
    ``<save_path>checkpoints/<exp_name>/models/model_<test_area>.t7``.

    Args:
        args: Parsed command-line namespace. Reads ``num_points``,
            ``test_area``, ``data_path``, ``batch_size``, ``test_batch_size``,
            ``cuda``, ``model``, ``use_sgd``, ``lr``, ``momentum``,
            ``scheduler``, ``epochs``, ``save_path`` and ``exp_name``.
        io: Logger object exposing ``cprint(str)``.

    Raises:
        Exception: If ``args.model`` is not 'dgcnn' or 'dgcnn_full'.
    """
    # Local import: wall-clock fallback for timing when CUDA is not in use.
    import time

    # Number of per-point class scores; matches the 13 classes hard-coded in
    # calculate_sem_IoU.
    num_classes = 13

    train_loader = DataLoader(
        S3DIS(partition='train', num_points=args.num_points, test_area=args.test_area, data_path=args.data_path),
        num_workers=8, batch_size=args.batch_size, shuffle=True, drop_last=True)
    test_loader = DataLoader(
        S3DIS(partition='test', num_points=args.num_points, test_area=args.test_area, data_path=args.data_path),
        num_workers=8, batch_size=args.test_batch_size, shuffle=True, drop_last=False)

    use_cuda = bool(args.cuda)
    device = torch.device("cuda" if use_cuda else "cpu")

    # Try to load models
    if args.model == 'dgcnn':
        model = DGCNN_semseg(args).to(device)
    elif args.model == 'dgcnn_full':
        model = DGCNN_semseg_full(args).to(device)
    else:
        raise Exception("Not implemented")
    print(str(model))

    model = nn.DataParallel(model)
    print("Let's use", torch.cuda.device_count(), "GPUs!")

    if args.use_sgd:
        print("Use SGD")
        # SGD runs with 100x the configured learning rate.
        opt = optim.SGD(model.parameters(), lr=args.lr*100, momentum=args.momentum, weight_decay=1e-4)
    else:
        print("Use Adam")
        opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-4)

    if args.scheduler == 'cos':
        scheduler = CosineAnnealingLR(opt, args.epochs, eta_min=1e-3)
    elif args.scheduler == 'step':
        # Only step size and decay factor are passed. A fourth positional
        # argument would be `last_epoch`, which makes StepLR expect a resumed
        # optimizer and fail with a KeyError on 'initial_lr'.
        scheduler = StepLR(opt, step_size=20, gamma=0.5)

    criterion = cal_loss

    best_test_iou = 0
    for epoch in range(args.epochs):
        ####################
        # Train
        ####################
        train_loss = 0.0
        count = 0.0
        model.train()
        train_true_cls = []
        train_pred_cls = []
        train_true_seg = []
        train_pred_seg = []

        for data, seg in train_loader:
            data, seg = data.to(device), seg.to(device)
            # Swap the last two axes before the forward pass
            # (presumably points-last layout for the model -- TODO confirm).
            data = data.permute(0, 2, 1)
            batch_size = data.size()[0]
            opt.zero_grad()

            seg_pred = model(data)

            # Move the class scores to the last axis so they can be flattened
            # to one row of scores per point.
            seg_pred = seg_pred.permute(0, 2, 1).contiguous()
            loss = criterion(seg_pred.view(-1, num_classes), seg.view(-1))
            loss.backward()
            opt.step()

            pred = seg_pred.max(dim=2)[1]               # (batch_size, num_points)
            count += batch_size
            train_loss += loss.item() * batch_size
            seg_np = seg.cpu().numpy()                  # (batch_size, num_points)
            pred_np = pred.detach().cpu().numpy()       # (batch_size, num_points)
            train_true_cls.append(seg_np.reshape(-1))   # (batch_size * num_points)
            train_pred_cls.append(pred_np.reshape(-1))  # (batch_size * num_points)
            train_true_seg.append(seg_np)
            train_pred_seg.append(pred_np)

        if args.scheduler == 'cos':
            scheduler.step()
        elif args.scheduler == 'step':
            # Stop decaying once the learning rate reaches the 1e-5 floor and
            # clamp it there if the last step undershot.
            if opt.param_groups[0]['lr'] > 1e-5:
                scheduler.step()
            if opt.param_groups[0]['lr'] < 1e-5:
                for param_group in opt.param_groups:
                    param_group['lr'] = 1e-5

        train_true_cls = np.concatenate(train_true_cls)
        train_pred_cls = np.concatenate(train_pred_cls)
        train_acc = metrics.accuracy_score(train_true_cls, train_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(train_true_cls, train_pred_cls)
        train_true_seg = np.concatenate(train_true_seg, axis=0)
        train_pred_seg = np.concatenate(train_pred_seg, axis=0)
        train_ious = calculate_sem_IoU(train_pred_seg, train_true_seg)
        # nanmean ignores classes whose IoU is undefined (absent from both
        # prediction and ground truth); identical to mean when none are nan.
        train_miou = np.nanmean(train_ious)
        outstr = 'Train %d, loss: %.6f, train acc: %.6f, train avg acc: %.6f, train iou: %.6f.' \
                 % (epoch, train_loss*1.0/count, train_acc, avg_per_class_acc, train_miou)
        io.cprint(outstr)

        ####################
        # Test
        ####################
        test_loss = 0.0
        count = 0.0
        model.eval()
        test_true_cls = []
        test_pred_cls = []
        test_true_seg = []
        test_pred_seg = []

        # CUDA events time the forward pass on the GPU; they only exist when
        # CUDA is in use, so the CPU path falls back to wall-clock timing.
        if use_cuda:
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
        test_time = []  # per-batch forward time in milliseconds
        with torch.no_grad():
            for data, seg in test_loader:
                data, seg = data.to(device), seg.to(device)
                data = data.permute(0, 2, 1)
                batch_size = data.size()[0]

                if use_cuda:
                    torch.cuda.empty_cache()
                    start.record()
                    seg_pred = model(data)
                    end.record()
                    # Wait for the GPU so elapsed_time() is valid.
                    torch.cuda.synchronize()
                    test_time.append(start.elapsed_time(end))
                else:
                    tic = time.time()
                    seg_pred = model(data)
                    test_time.append((time.time() - tic) * 1000.0)

                seg_pred = seg_pred.permute(0, 2, 1).contiguous()
                loss = criterion(seg_pred.view(-1, num_classes), seg.view(-1))
                pred = seg_pred.max(dim=2)[1]
                count += batch_size
                test_loss += loss.item() * batch_size
                seg_np = seg.cpu().numpy()
                pred_np = pred.detach().cpu().numpy()
                test_true_cls.append(seg_np.reshape(-1))
                test_pred_cls.append(pred_np.reshape(-1))
                test_true_seg.append(seg_np)
                test_pred_seg.append(pred_np)

        # Peak allocated memory summed over all visible GPUs, in GiB.
        if use_cuda:
            mem = sum(torch.cuda.max_memory_allocated(device=gpu)
                      for gpu in range(torch.cuda.device_count()))
            mem = mem / 1024.0 ** 3
        else:
            mem = 0.0

        test_true_cls = np.concatenate(test_true_cls)
        test_pred_cls = np.concatenate(test_pred_cls)
        test_acc = metrics.accuracy_score(test_true_cls, test_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(test_true_cls, test_pred_cls)
        test_true_seg = np.concatenate(test_true_seg, axis=0)
        test_pred_seg = np.concatenate(test_pred_seg, axis=0)
        test_ious = calculate_sem_IoU(test_pred_seg, test_true_seg)
        # A plain mean would be nan if any class were absent, and a nan never
        # compares >= the best score, so no checkpoint would ever be saved.
        test_miou = np.nanmean(test_ious)
        outstr = 'Test %d, loss: %.6f, test acc: %.6f, test avg acc: %.6f, test iou: %.6f.' \
                 ' Average test time: %.4f [ms], max GPU memory %.4f [GB]' \
                 % (epoch, test_loss*1.0/count, test_acc, avg_per_class_acc, test_miou,
                    sum(test_time)/len(test_time), mem)
        io.cprint(outstr)
        if test_miou >= best_test_iou:
            best_test_iou = test_miou
            torch.save(model.state_dict(), args.save_path + 'checkpoints/%s/models/model_%s.t7' % (args.exp_name, args.test_area))


def test(args, io):
    """Evaluate saved models on one S3DIS test area or on all six.

    For every area selected by ``args.test_area`` ('1'..'6' or 'all') a fresh
    model is built, the weights ``model_<area>.t7`` are loaded from
    ``args.model_root`` (skipped when ``model_root`` is empty), and accuracy,
    balanced accuracy, mean IoU, average forward time and peak GPU memory are
    logged through ``io.cprint``. With ``'all'``, an overall summary over the
    concatenated predictions of every area is logged as well.

    Args:
        args: Parsed command-line namespace. Reads ``test_area``,
            ``num_points``, ``data_path``, ``test_batch_size``, ``cuda``,
            ``model`` and ``model_root``.
        io: Logger object exposing ``cprint(str)``.

    Raises:
        Exception: If ``args.model`` is not 'dgcnn' or 'dgcnn_full'.
    """
    # Local import: wall-clock fallback for timing when CUDA is not in use.
    import time

    use_cuda = bool(args.cuda)
    device = torch.device("cuda" if use_cuda else "cpu")

    all_true_cls = []
    all_pred_cls = []
    all_true_seg = []
    all_pred_seg = []
    for test_area in range(1, 7):
        test_area = str(test_area)
        if not ((args.test_area == 'all') or (test_area == args.test_area)):
            continue

        test_loader = DataLoader(
            S3DIS(partition='test', num_points=args.num_points, test_area=test_area, data_path=args.data_path),
            batch_size=args.test_batch_size, shuffle=False, drop_last=False)

        # Try to load models
        if args.model == 'dgcnn':
            model = DGCNN_semseg(args).to(device)
        elif args.model == 'dgcnn_full':
            model = DGCNN_semseg_full(args).to(device)
        else:
            raise Exception("Not implemented")

        # Wrap before loading: train() saves the state dict of the
        # DataParallel wrapper, so the keys match the wrapped model.
        model = nn.DataParallel(model)
        if args.model_root:
            # map_location lets checkpoints written on a GPU be restored on
            # whatever device this run uses (including CPU-only hosts).
            model.load_state_dict(torch.load(os.path.join(args.model_root, 'model_%s.t7' % test_area),
                                             map_location=device))
        model = model.eval()
        test_true_cls = []
        test_pred_cls = []
        test_true_seg = []
        test_pred_seg = []

        # CUDA events time the forward pass on the GPU; they only exist when
        # CUDA is in use, so the CPU path falls back to wall-clock timing.
        if use_cuda:
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
        test_time = []  # per-batch forward time in milliseconds
        with torch.no_grad():
            for data, seg in test_loader:
                data, seg = data.to(device), seg.to(device)
                # Swap the last two axes before the forward pass
                # (presumably points-last layout for the model -- TODO confirm).
                data = data.permute(0, 2, 1)

                if use_cuda:
                    torch.cuda.empty_cache()
                    start.record()
                    seg_pred = model(data)
                    end.record()
                    # Wait for the GPU so elapsed_time() is valid.
                    torch.cuda.synchronize()
                    test_time.append(start.elapsed_time(end))
                else:
                    tic = time.time()
                    seg_pred = model(data)
                    test_time.append((time.time() - tic) * 1000.0)

                # Class scores to the last axis, then take the arg-max label.
                seg_pred = seg_pred.permute(0, 2, 1).contiguous()
                pred = seg_pred.max(dim=2)[1]
                seg_np = seg.cpu().numpy()
                pred_np = pred.detach().cpu().numpy()
                test_true_cls.append(seg_np.reshape(-1))
                test_pred_cls.append(pred_np.reshape(-1))
                test_true_seg.append(seg_np)
                test_pred_seg.append(pred_np)

        # Peak allocated memory summed over all visible GPUs, in GiB.
        if use_cuda:
            mem = sum(torch.cuda.max_memory_allocated(device=gpu)
                      for gpu in range(torch.cuda.device_count()))
            mem = mem / 1024.0 ** 3
        else:
            mem = 0.0

        test_true_cls = np.concatenate(test_true_cls)
        test_pred_cls = np.concatenate(test_pred_cls)
        test_acc = metrics.accuracy_score(test_true_cls, test_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(test_true_cls, test_pred_cls)
        test_true_seg = np.concatenate(test_true_seg, axis=0)
        test_pred_seg = np.concatenate(test_pred_seg, axis=0)
        test_ious = calculate_sem_IoU(test_pred_seg, test_true_seg)
        # nanmean ignores classes whose IoU is undefined (absent from both
        # prediction and ground truth); identical to mean when none are nan.
        outstr = 'Test :: test area: %s, test acc: %.6f, test avg acc: %.6f, test iou: %.6f. Average test time: %.4f [ms], max GPU memory %.4f [GB]' % (
            test_area, test_acc, avg_per_class_acc, np.nanmean(test_ious),
            sum(test_time)/len(test_time), mem)
        io.cprint(outstr)
        all_true_cls.append(test_true_cls)
        all_pred_cls.append(test_pred_cls)
        all_true_seg.append(test_true_seg)
        all_pred_seg.append(test_pred_seg)

    if args.test_area == 'all':
        # Pool the predictions of every area for dataset-wide metrics.
        all_true_cls = np.concatenate(all_true_cls)
        all_pred_cls = np.concatenate(all_pred_cls)
        all_acc = metrics.accuracy_score(all_true_cls, all_pred_cls)
        avg_per_class_acc = metrics.balanced_accuracy_score(all_true_cls, all_pred_cls)
        all_true_seg = np.concatenate(all_true_seg, axis=0)
        all_pred_seg = np.concatenate(all_pred_seg, axis=0)
        all_ious = calculate_sem_IoU(all_pred_seg, all_true_seg)
        outstr = 'Overall Test :: test acc: %.6f, test avg acc: %.6f, test iou: %.6f' % (
            all_acc, avg_per_class_acc, np.nanmean(all_ious))
        io.cprint(outstr)


def str2bool(value):
    """Parse a command-line boolean such as 'True', 'false', '1' or 'no'.

    ``argparse`` with ``type=bool`` treats every non-empty string as True, so
    ``--eval False`` would silently enable evaluation. This converter maps
    the usual spellings to the intended value instead.

    Args:
        value: A bool (returned unchanged) or a string to interpret.

    Returns:
        The parsed boolean. An empty string maps to False.

    Raises:
        argparse.ArgumentTypeError: If the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % (value,))


if __name__ == "__main__":
    # Training settings
    parser = argparse.ArgumentParser(description='Point Cloud Part Segmentation')
    parser.add_argument('--exp_name', type=str, default='exp', metavar='N',
                        help='Name of the experiment')
    parser.add_argument('--model', type=str, default='dgcnn', metavar='N',
                        help='Model to use')
    parser.add_argument('--dataset', type=str, default='S3DIS', metavar='N',
                        choices=['S3DIS'])
    parser.add_argument('--test_area', type=str, default=None, metavar='N',
                        choices=['1', '2', '3', '4', '5', '6', 'all'])
    parser.add_argument('--batch_size', type=int, default=32, metavar='batch_size',
                        help='Size of batch)')
    parser.add_argument('--test_batch_size', type=int, default=16, metavar='batch_size',
                        help='Size of batch)')
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of episode to train ')
    # Boolean flags use str2bool so that "--flag False" really means False.
    parser.add_argument('--use_sgd', type=str2bool, default=True,
                        help='Use SGD')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='learning rate (default: 0.001, 0.1 if using sgd)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='SGD momentum (default: 0.9)')
    parser.add_argument('--scheduler', type=str, default='cos', metavar='N',
                        choices=['cos', 'step'],
                        help='Scheduler to use, [cos, step]')
    parser.add_argument('--no_cuda', type=str2bool, default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--eval', type=str2bool, default=False,
                        help='evaluate the model')
    parser.add_argument('--num_points', type=int, default=4096,
                        help='num of points to use')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout rate')
    parser.add_argument('--emb_dims', type=int, default=1024, metavar='N',
                        help='Dimension of embeddings')
    parser.add_argument('--k', type=int, default=20, metavar='N',
                        help='Num of nearest neighbors to use')
    parser.add_argument('--n', type=int, default=40, metavar='N',
                        help='Num of nearest neighbors in the larger neighborhood. '
                             'Used by DGCNN_knn_similarity and DGCNN_conv_reformulate.')
    parser.add_argument('--progressive', type=int, default=0, metavar='N',
                        help='Enlarge the area of the sampling neighborhood progressively.')
    parser.add_argument('--model_root', type=str, default='', metavar='N',
                        help='Pretrained model root')
    parser.add_argument('--save_path', type=str, default='', metavar='N',
                        help='The path to save the checkpoints')
    parser.add_argument('--data_path', type=str, default='', metavar='N',
                        help='The path to data is saved')
    args = parser.parse_args()

    # Create the checkpoint folders before opening the run log inside them.
    _init_(args.save_path)

    io = IOStream(args.save_path + 'checkpoints/' + args.exp_name + '/run.log')
    io.cprint(str(args))

    # CUDA is used only when it is both requested and actually available.
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    if args.cuda:
        io.cprint(
            'Using GPU : ' + str(torch.cuda.current_device()) + ' from ' + str(torch.cuda.device_count()) + ' devices')
        torch.cuda.manual_seed(args.seed)
    else:
        io.cprint('Using CPU')

    if not args.eval:
        train(args, io)
    else:
        test(args, io)
