import glob
import os

import torch
import tqdm
import time
from torch.nn.utils import clip_grad_norm_
from pcdet.utils import common_utils, commu_utils
from pcdet.utils.spconv_utils import spconv

import torch.nn as nn


import numpy as np
try:
    import kornia
except:
    pass

from torch import Tensor

from torch.optim.lr_scheduler import ReduceLROnPlateau
from tools.train_utils.optimization.learning_schedules_fastai import CosineWarmupLR, OneCycle, ExponentialDecay

import torch.nn.functional as F
import torchvision

def load_data_to_gpu(batch_dict):
    """Replace the NumPy arrays stored in ``batch_dict`` with CUDA tensors, in place.

    Values that are not ``np.ndarray`` are left untouched, as are the
    ``frame_id``, ``metadata`` and ``calib`` entries. The ``images`` entry is
    converted with ``kornia.image_to_tensor`` and made contiguous,
    ``image_shape`` becomes an int tensor, and every other array becomes a
    float tensor.

    Args:
        batch_dict: Mapping from field name to batch data; modified in place.
    """
    passthrough_keys = ("frame_id", "metadata", "calib")
    for name, value in batch_dict.items():
        if not isinstance(value, np.ndarray) or name in passthrough_keys:
            continue

        if name == "images":
            converted = kornia.image_to_tensor(value).float().cuda().contiguous()
        elif name == "image_shape":
            converted = torch.from_numpy(value).int().cuda()
        else:
            converted = torch.from_numpy(value).float().cuda()

        # Re-assigning an existing key is safe while iterating over items().
        batch_dict[name] = converted

def train_one_epoch(
    detector_meanVFE,
    detector_backbone,
    inversion_model,
    optimizer,
    train_loader,
    model_func,
    lr_scheduler,
    accumulated_iter,
    optim_cfg,
    rank,
    tbar,
    total_it_each_epoch,
    dataloader_iter,
    tb_log=None,
    leave_pbar=False,
    loss_option=None,
    grid_size=None,
):
    """Train ``inversion_model`` for one epoch and return the new iteration count.

    For each batch, ``detector_meanVFE`` produces voxel features and
    coordinates, ``detector_backbone`` is run on that voxel dict, and
    ``inversion_model`` receives the backbone's ``x_conv4_1`` multi-scale
    feature. The dense model output is regressed with an MSE loss against the
    first three channels of the densified input voxel tensor, restricted to
    the positions where the prediction is non-zero.

    Args:
        detector_meanVFE: Callable mapping a batch dict to a dict containing
            ``voxel_features``, ``voxel_coords`` and ``batch_size``.
        detector_backbone: Callable whose output holds
            ``["multi_scale_3d_features"]["x_conv4_1"]``.
        inversion_model: Model being trained; its output must offer ``.dense()``.
        optimizer: Optimizer that is zeroed and stepped once per iteration.
        train_loader: Data loader; re-iterated when the iterator is exhausted.
        model_func: Unused here; kept for interface compatibility.
        lr_scheduler: Accepted but never stepped in this function.
            NOTE(review): confirm that a constant learning rate is intended.
        accumulated_iter: Global iteration counter at the start of the epoch.
        optim_cfg: Unused here; kept for interface compatibility.
        rank: Process rank; only rank 0 drives progress bars and scalar logs.
        tbar: Outer (epoch) progress bar whose postfix is updated on rank 0.
        total_it_each_epoch: Number of iterations to run.
        dataloader_iter: Iterator over ``train_loader`` carried across epochs.
        tb_log: Optional summary writer exposing ``add_scalar``.
        leave_pbar: Whether the per-epoch progress bar stays after closing.
        loss_option: Unused here; kept for interface compatibility.
        grid_size: Voxel grid size; reversed and padded by ``[1, 0, 0]`` to
            form the sparse tensor's spatial shape.

    Returns:
        ``accumulated_iter`` increased by the number of iterations performed.

    Raises:
        ValueError: If ``grid_size`` is ``None``.
    """
    if grid_size is None:
        raise ValueError("grid_size must be provided to build the sparse input tensor")

    if total_it_each_epoch == len(train_loader):
        dataloader_iter = iter(train_loader)

    if rank == 0:
        pbar = tqdm.tqdm(
            total=total_it_each_epoch,
            leave=leave_pbar,
            desc="train",
            dynamic_ncols=True,
        )
        data_time = common_utils.AverageMeter()
        batch_time = common_utils.AverageMeter()
        forward_time = common_utils.AverageMeter()

    # Loop-invariant: reversed grid size plus [1, 0, 0]
    # (the original author noted 41,400,352 for their configuration).
    sparse_shape = np.array(grid_size)[::-1] + [1, 0, 0]

    for _ in range(total_it_each_epoch):
        end = time.time()
        try:
            batch = next(dataloader_iter)
        except StopIteration:
            dataloader_iter = iter(train_loader)
            batch = next(dataloader_iter)
            print("new iters")

        data_timer = time.time()
        cur_data_time = data_timer - end

        # Some optimizer wrappers expose ``lr`` directly; plain torch
        # optimizers only carry it in their param groups.
        try:
            cur_lr = float(optimizer.lr)
        except Exception:
            cur_lr = optimizer.param_groups[0]["lr"]

        if tb_log is not None:
            tb_log.add_scalar("meta_data/learning_rate", cur_lr, accumulated_iter)

        inversion_model.train()
        optimizer.zero_grad()

        load_data_to_gpu(batch)
        voxel_dict = detector_meanVFE(batch)

        # Build the regression target from the voxel features *before* the
        # backbone is run on the same dict.
        input_sp_tensor = spconv.SparseConvTensor(
            features=voxel_dict["voxel_features"],
            indices=voxel_dict["voxel_coords"].int(),
            spatial_shape=sparse_shape,
            batch_size=voxel_dict["batch_size"],
        )

        backbone_output = detector_backbone(voxel_dict)
        x_conv4 = backbone_output["multi_scale_3d_features"]["x_conv4_1"]

        reg_result = inversion_model(x_conv4).dense()
        # First three feature channels of the densified input
        # (presumably xyz -- TODO confirm against the VFE output).
        input_gt = input_sp_tensor.dense()[:, :3]

        # Only positions with a non-zero prediction contribute to the loss;
        # the mask is computed once and shared by both operands.
        nonzero_mask = reg_result != 0
        inversion_model_loss = F.mse_loss(
            reg_result[nonzero_mask], input_gt[nonzero_mask], reduction="mean"
        )

        tb_dict = {
            "total_loss": inversion_model_loss.item(),
        }

        forward_timer = time.time()
        cur_forward_time = forward_timer - data_timer

        inversion_model_loss.backward()
        optimizer.step()

        accumulated_iter += 1

        cur_batch_time = time.time() - end

        # average reduce
        avg_data_time = commu_utils.average_reduce_value(cur_data_time)
        avg_forward_time = commu_utils.average_reduce_value(cur_forward_time)
        avg_batch_time = commu_utils.average_reduce_value(cur_batch_time)

        # log to console and tensorboard
        if rank == 0:
            data_time.update(avg_data_time)
            forward_time.update(avg_forward_time)
            batch_time.update(avg_batch_time)

            disp_dict = dict(tb_dict)
            disp_dict["lr"] = cur_lr

            pbar.update()
            pbar.set_postfix(dict(total_it=accumulated_iter))
            tbar.set_postfix(disp_dict)
            tbar.refresh()

            if tb_log is not None:
                tb_log.add_scalar("meta_data/learning_rate", cur_lr, accumulated_iter)
                for key, val in tb_dict.items():
                    tb_log.add_scalar("train/" + key, val, accumulated_iter)

    if rank == 0:
        pbar.close()
    return accumulated_iter


def train_model(
    detector_meanVFE,
    detector_backbone,
    inversion_model,
    optimizer,
    train_loader,
    model_func,
    lr_scheduler,
    optim_cfg,
    start_epoch,
    total_epochs,
    start_iter,
    rank,
    tb_log,
    ckpt_save_dir,
    train_sampler=None,
    lr_warmup_scheduler=None,
    ckpt_save_interval=1,
    max_ckpt_save_num=50,
    merge_all_iters_to_one_epoch=False,
    loss_option=None,
    grid_size=None,
    pc_range=None,
    voxel_size=None,
):
    """Run the epoch loop for the inversion model and save periodic checkpoints.

    Epochs ``start_epoch`` .. ``total_epochs - 1`` are each handed to
    ``train_one_epoch``. On rank 0, every ``ckpt_save_interval`` epochs the
    oldest ``checkpoint_epoch_*.pth`` files (by modification time) are removed
    to make room, and a new checkpoint of ``inversion_model`` and
    ``optimizer`` is written to ``ckpt_save_dir``.

    Args:
        detector_meanVFE, detector_backbone, inversion_model, optimizer,
        train_loader, model_func, optim_cfg, rank, tb_log, loss_option,
        grid_size: Forwarded to ``train_one_epoch``.
        lr_scheduler: Scheduler forwarded once warm-up is over.
        start_epoch: First epoch index to run.
        total_epochs: Epoch index at which training stops (exclusive).
        start_iter: Initial value of the global iteration counter.
        ckpt_save_dir: Directory for checkpoints; combined with ``/``, so a
            ``pathlib.Path``-like object is expected.
        train_sampler: Optional sampler whose ``set_epoch`` is called per epoch.
        lr_warmup_scheduler: Optional scheduler forwarded while
            ``cur_epoch < optim_cfg.WARMUP_EPOCH``.
        ckpt_save_interval: Save a checkpoint every this many epochs.
        max_ckpt_save_num: Number of checkpoint files kept after each save.
        merge_all_iters_to_one_epoch: If true, the dataset is asked to merge
            all epochs and the per-epoch iteration count is divided by
            ``total_epochs``.
        pc_range, voxel_size: Unused here.
    """
    accumulated_iter = start_iter
    with tqdm.trange(
        start_epoch, total_epochs, desc="epochs", dynamic_ncols=True, leave=(rank == 0)
    ) as tbar:
        iters_per_epoch = len(train_loader)
        if merge_all_iters_to_one_epoch:
            dataset = train_loader.dataset
            assert hasattr(dataset, "merge_all_iters_to_one_epoch")
            dataset.merge_all_iters_to_one_epoch(merge=True, epochs=total_epochs)
            iters_per_epoch = len(train_loader) // max(total_epochs, 1)

        dataloader_iter = iter(train_loader)
        for cur_epoch in tbar:
            if train_sampler is not None:
                train_sampler.set_epoch(cur_epoch)

            # The warm-up scheduler is used only during the first WARMUP_EPOCH epochs.
            in_warmup = (
                lr_warmup_scheduler is not None
                and cur_epoch < optim_cfg.WARMUP_EPOCH
            )
            cur_scheduler = lr_warmup_scheduler if in_warmup else lr_scheduler

            accumulated_iter = train_one_epoch(
                detector_meanVFE,
                detector_backbone,
                inversion_model,
                optimizer,
                train_loader,
                model_func,
                lr_scheduler=cur_scheduler,
                accumulated_iter=accumulated_iter,
                optim_cfg=optim_cfg,
                rank=rank,
                tbar=tbar,
                tb_log=tb_log,
                leave_pbar=(cur_epoch + 1 == total_epochs),
                total_it_each_epoch=iters_per_epoch,
                dataloader_iter=dataloader_iter,
                loss_option=loss_option,
                grid_size=grid_size,
            )

            # Checkpointing happens on rank 0 at the configured interval only.
            trained_epoch = cur_epoch + 1
            if trained_epoch % ckpt_save_interval != 0 or rank != 0:
                continue

            existing_ckpts = sorted(
                glob.glob(str(ckpt_save_dir / "checkpoint_epoch_*.pth")),
                key=os.path.getmtime,
            )

            # Drop the oldest files so that, once the new checkpoint is
            # written, max_ckpt_save_num remain. The range is empty whenever
            # fewer than max_ckpt_save_num files exist.
            num_to_remove = len(existing_ckpts) - max_ckpt_save_num + 1
            for stale_idx in range(num_to_remove):
                os.remove(existing_ckpts[stale_idx])

            ckpt_name = ckpt_save_dir / ("checkpoint_epoch_%d" % trained_epoch)
            state = checkpoint_state(
                inversion_model, optimizer, trained_epoch, accumulated_iter
            )
            save_checkpoint(state, filename=ckpt_name)


def model_state_to_cpu(model_state):
    """Return a copy of ``model_state`` whose values have been moved to the CPU.

    The result is a fresh mapping of the same type as ``model_state`` (e.g. an
    ordered dict) with the keys in their original order; each value is
    replaced by the result of its ``.cpu()`` method.
    """
    cpu_state = type(model_state)()  # same mapping type as the input
    cpu_state.update((name, tensor.cpu()) for name, tensor in model_state.items())
    return cpu_state


def checkpoint_state(model=None, optimizer=None, epoch=None, it=None):
    """Assemble the dictionary that is written to a checkpoint file.

    Args:
        model: Optional model. When it is wrapped in
            ``DistributedDataParallel`` the state of the wrapped module is
            taken and moved to the CPU; otherwise ``model.state_dict()`` is
            stored as is.
        optimizer: Optional optimizer whose ``state_dict()`` is stored.
        epoch: Epoch number to record.
        it: Iteration number to record.

    Returns:
        A dict with the keys ``epoch``, ``it``, ``model_state``,
        ``optimizer_state`` and ``version``. ``version`` is
        ``"pcdet+<version>"`` when ``pcdet`` can be imported and exposes
        ``__version__``, and ``"none"`` otherwise.
    """
    optim_state = optimizer.state_dict() if optimizer is not None else None

    if model is None:
        model_state = None
    elif isinstance(model, torch.nn.parallel.DistributedDataParallel):
        model_state = model_state_to_cpu(model.module.state_dict())
    else:
        model_state = model.state_dict()

    # The version tag is best effort, but KeyboardInterrupt / SystemExit must
    # not be swallowed, hence no bare ``except``.
    try:
        import pcdet

        version = "pcdet+" + pcdet.__version__
    except Exception:
        version = "none"

    return {
        "epoch": epoch,
        "it": it,
        "model_state": model_state,
        "optimizer_state": optim_state,
        "version": version,
    }

def save_checkpoint(state, filename="checkpoint"):
    """Serialize ``state`` with ``torch.save`` to ``<filename>.pth``.

    The whole ``state`` dict, including any ``optimizer_state`` entry, is
    written to a single file.

    Args:
        state: Checkpoint dictionary to serialize.
        filename: Target path without the ``.pth`` suffix; anything that
            formats to a path string (``str`` or path object) is accepted.
    """
    filename = "{}.pth".format(filename)
    torch.save(state, filename)

