import os
import torch
import torch.nn as nn

from torch.utils.data import Sampler

from transformers import Trainer
from transformers.trainer import (
    is_sagemaker_mp_enabled,
    get_parameter_names,
    has_length,
    ALL_LAYERNORM_LAYERS,
    logger,
)
from typing import List, Optional
from transformers.modeling_utils import unwrap_model
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
import torch.nn as nn


def maybe_zero_3(param, ignore_status=False, name=None):
    """Return a detached CPU clone of ``param``, gathering it first when needed.

    A parameter that carries a ``ds_id`` attribute is treated as managed by
    DeepSpeed ZeRO: it is gathered inside ``zero.GatheredParameters`` before
    its data is copied. Any other tensor is copied directly.

    Args:
        param: Tensor or parameter to copy.
        ignore_status: When False, a message is printed for a ZeRO parameter
            whose ``ds_status`` is ``ZeroParamStatus.NOT_AVAILABLE``.
        name: Label printed in that message.

    Returns:
        A detached clone of the parameter data located on the CPU.
    """
    if not hasattr(param, "ds_id"):
        # Plain tensor: DeepSpeed is not involved, so it is deliberately not
        # imported here and does not have to be installed for this path.
        return param.detach().cpu().clone()

    # Imported lazily so that only the ZeRO path depends on DeepSpeed.
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
        print(name, 'no ignore status')
    with zero.GatheredParameters([param]):
        # Copy while the full parameter is materialised inside the context.
        return param.data.detach().cpu().clone()


def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Collect CPU copies of the parameters whose name contains any given key.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        keys_to_match: Substrings; a parameter is kept when at least one of
            them occurs in its name.

    Returns:
        Dict mapping each selected name to the result of ``maybe_zero_3`` for
        that parameter, moved to the CPU.
    """
    # First pass: keep only the parameters whose name matches.
    selected = {}
    for param_name, tensor in named_params:
        if any(pattern in param_name for pattern in keys_to_match):
            selected[param_name] = tensor

    # Second pass: materialise every selected parameter on the CPU.
    gathered = {}
    for param_name, tensor in selected.items():
        gathered[param_name] = maybe_zero_3(tensor, ignore_status=True, name=param_name).cpu()
    return gathered


def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Distribute `indices` over `num_chunks` chunks with roughly equal total length.

    When the number of indices is not a multiple of `num_chunks`, the indices
    are simply dealt out round-robin. Otherwise each index goes to the chunk
    with the smallest accumulated length that still has room, and every chunk
    ends up with the same number of indices.
    """
    total = len(indices)
    if total % num_chunks:
        return [indices[offset::num_chunks] for offset in range(num_chunks)]

    capacity = total // num_chunks
    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks

    for idx in indices:
        # First chunk with the minimal accumulated length wins ties.
        target = min(range(num_chunks), key=loads.__getitem__)
        bucket = buckets[target]
        bucket.append(idx)
        loads[target] += lengths[idx]
        if len(bucket) == capacity:
            # A full chunk must never be selected again.
            loads[target] = float("inf")

    return buckets


# def get_modality_length_grouped_indices(lengths, types, batch_size, world_size, generator=None):
#     # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
#     assert all(l != 0 for l in lengths), "Should not have zero length."
#     if len(set(types)) == 1 and (all(l > 0 for l in lengths) or all(l < 0 for l in lengths)):  # single modality: every sample is mm or every sample is lang, where mm covers lazy, res, gres, is, seg
#         # all samples are in the same modality
#         return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
#     mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
#     lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])

#     mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
#     lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
#     megabatch_size = world_size * batch_size
#     mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
#     lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

#     last_mm = mm_megabatches[-1]
#     last_lang = lang_megabatches[-1]
#     additional_batch = last_mm + last_lang
#     megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
#     megabatch_indices = torch.randperm(len(megabatches), generator=generator)
#     megabatches = [megabatches[i] for i in megabatch_indices]

#     if len(additional_batch) > 0:
#         megabatches.append(sorted(additional_batch))

#     return [i for megabatch in megabatches for i in megabatch]

def get_modality_length_grouped_indices(lengths, types, batch_size, world_size, generator=None):
    """Order sample indices so that every megabatch holds a single sample group.

    Samples are partitioned by their entry in ``types``. Type ``1`` is split
    further by the sign of its length into a positive-length group ("lazy")
    and a negative-length group ("lang", ordered by absolute length). Each
    group is ordered with ``get_length_grouped_indices`` and cut into
    megabatches of ``world_size * batch_size`` indices; the megabatches of all
    groups are then shuffled together.

    Drop-last behaviour: the final megabatch of every group is discarded, even
    when it is full, so the result can contain fewer than ``len(lengths)``
    indices. Samples whose type is not one of 1-12 are discarded as well.

    Args:
        lengths: Per-sample lengths; must be non-zero.
        types: Per-sample type ids, aligned with ``lengths``.
        batch_size: Per-device batch size.
        world_size: Number of batches that make up one megabatch.
        generator: Optional ``torch.Generator`` used for the single-group
            shortcut and for the final megabatch shuffle. The per-group
            ordering always uses the global torch RNG.

    Returns:
        A flat list of sample indices.

    Raises:
        AssertionError: If any length is zero.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(length != 0 for length in lengths), "Should not have zero length."
    if len(set(types)) == 1 and (
        all(length > 0 for length in lengths) or all(length < 0 for length in lengths)
    ):
        # Single modality: every sample has the same type and the same length
        # sign, so plain length grouping is sufficient.
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)

    # Type id -> group name for the types that are not split by length sign.
    plain_groups = {
        2: "res", 3: "gres", 4: "is", 5: "seg", 6: "rec", 7: "ovn",
        8: "lnc", 9: "lns", 10: "reg", 11: "nvn", 12: "det",
    }
    # Order in which the groups' megabatches are concatenated before the
    # final shuffle; kept fixed so seeded runs stay reproducible.
    concat_order = (
        "res", "gres", "is", "seg", "lazy", "lang", "lnc", "lns",
        "rec", "ovn", "reg", "nvn", "det",
    )

    def _members(sample_type):
        """Return the (index, length) pairs of all samples with this type."""
        return [(i, length) for i, length in enumerate(lengths) if types[i] == sample_type]

    def _order_group(pairs):
        """Return the original indices of one group in length-grouped order."""
        if not pairs:
            # An empty group used to crash on ``zip(*[])`` unpacking.
            return []
        sample_indices, sample_lengths = zip(*pairs)
        local_order = get_length_grouped_indices(sample_lengths, batch_size, world_size, generator=None)
        return [sample_indices[i] for i in local_order]

    ordered = {}
    for sample_type in set(types):
        if sample_type == 1:
            members = _members(sample_type)
            # Only type-1 samples decide whether a "lang" group exists.
            ordered["lazy"] = _order_group([(i, length) for i, length in members if length > 0])
            ordered["lang"] = _order_group([(i, -length) for i, length in members if length < 0])
        elif sample_type in plain_groups:
            ordered[plain_groups[sample_type]] = _order_group(_members(sample_type))

    megabatch_size = world_size * batch_size
    megabatches = []
    for group_name in concat_order:
        group = ordered.get(group_name)
        if not group:
            continue
        chunks = [group[start : start + megabatch_size] for start in range(0, len(group), megabatch_size)]
        # Drop last: the final megabatch of each group is not used.
        megabatches.extend(chunks[:-1])

    megabatch_indices = torch.randperm(len(megabatches), generator=generator)
    megabatches = [megabatches[i] for i in megabatch_indices]

    return [i for megabatch in megabatches for i in megabatch]


def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """Return a random index order in which nearby samples have similar length.

    The indices are shuffled, cut into megabatches of
    ``world_size * batch_size`` entries, sorted by descending length inside
    each megabatch and finally balanced over ``world_size`` chunks with
    ``split_to_even_chunks``. ``merge`` is accepted but not used.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    permutation = torch.randperm(len(lengths), generator=generator)
    span = world_size * batch_size

    flat = []
    for start in range(0, len(lengths), span):
        group = permutation[start : start + span].tolist()
        # Longest samples first; the sort is stable, so ties keep shuffle order.
        group.sort(key=lambda idx: lengths[idx], reverse=True)
        for shard in split_to_even_chunks(group, lengths, world_size):
            flat.extend(shard)
    return flat


class LengthGroupedSampler(Sampler):
    r"""
    Sampler yielding dataset indices so that samples of similar length end up
    close together, while the overall order stays partly random. With
    `group_by_modality` set, the ordering is additionally grouped by the
    per-sample `types`.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        types: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        # `lengths` is optional in the signature only; it is required in practice.
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size, self.world_size = batch_size, world_size
        self.lengths, self.types = lengths, types
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        # Number of known sample lengths.
        return len(self.lengths)

    def __iter__(self):
        # A fresh ordering is computed on every call.
        if not self.group_by_modality:
            order = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
            return iter(order)

        order = get_modality_length_grouped_indices(
            self.lengths, self.types, self.batch_size, self.world_size, generator=self.generator
        )
        return iter(order)


class TraceVLMTrainer(Trainer):
    """Trainer with modality-aware sampling, an optional separate learning rate
    for ``mm_projector`` parameters, and adapter-only checkpoints."""

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Return the training sampler.

        Uses ``LengthGroupedSampler`` with modality grouping when
        ``args.group_by_modality_length`` is set; otherwise defers to the
        parent implementation. Returns ``None`` for a missing or unsized
        dataset.
        """
        dataset = self.train_dataset
        if dataset is None or not has_length(dataset):
            return None

        if not self.args.group_by_modality_length:
            return super()._get_train_sampler()

        return LengthGroupedSampler(
            self.args.train_batch_size,
            # One megabatch spans all ranks and all accumulation steps.
            world_size=self.args.world_size * self.args.gradient_accumulation_steps,
            lengths=dataset.modality_lengths,
            types=dataset.modality_types,
            group_by_modality=True,
        )

    def create_optimizer(self):
        """
        Create the optimizer if it does not exist yet and return it.

        Trainable parameters are split into weight-decay and no-decay groups.
        When ``args.mm_projector_lr`` is set, parameters whose name contains
        ``mm_projector`` get their own two groups using that learning rate.
        """
        if is_sagemaker_mp_enabled():
            return super().create_optimizer()

        if self.optimizer is not None:
            return self.optimizer

        opt_model = self.model

        # Names that receive weight decay: everything outside the layer-norm
        # layers, excluding biases.
        decay_names = {
            name for name in get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) if "bias" not in name
        }
        trainable = [(name, param) for name, param in opt_model.named_parameters() if param.requires_grad]

        def split_by_decay(candidates):
            """Split (name, param) pairs into (decayed, non-decayed) parameter lists."""
            decayed = [param for name, param in candidates if name in decay_names]
            plain = [param for name, param in candidates if name not in decay_names]
            return decayed, plain

        projector_lr = self.args.mm_projector_lr
        if projector_lr is not None:
            backbone = [(name, param) for name, param in trainable if "mm_projector" not in name]
            projector = [(name, param) for name, param in trainable if "mm_projector" in name]
            backbone_decay, backbone_plain = split_by_decay(backbone)
            projector_decay, projector_plain = split_by_decay(projector)
            # Group order is kept stable: backbone first, projector second.
            optimizer_grouped_parameters = [
                {"params": backbone_decay, "weight_decay": self.args.weight_decay},
                {"params": backbone_plain, "weight_decay": 0.0},
                {"params": projector_decay, "weight_decay": self.args.weight_decay, "lr": projector_lr},
                {"params": projector_plain, "weight_decay": 0.0, "lr": projector_lr},
            ]
        else:
            with_decay, without_decay = split_by_decay(trainable)
            optimizer_grouped_parameters = [
                {"params": with_decay, "weight_decay": self.args.weight_decay},
                {"params": without_decay, "weight_decay": 0.0},
            ]

        optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
        self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)

        if optimizer_cls.__name__ == "Adam8bit":
            import bitsandbytes

            manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

            # Register a 32-bit override for the weights of embedding modules.
            skipped = 0
            embeddings = (module for module in opt_model.modules() if isinstance(module, nn.Embedding))
            for module in embeddings:
                # Keyed by data pointer so shared storage is counted once.
                unique_sizes = {p.data_ptr(): p.numel() for p in module.parameters()}
                skipped += sum(unique_sizes.values())
                logger.info(f"skipped {module}: {skipped/2**20}M params")
                manager.register_module_override(module, "weight", {"optim_bits": 32})
                logger.debug(f"bitsandbytes: will optimize {module} in fp32")
            logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer

    def _save_checkpoint(self, model, trial, metrics=None):
        """Save a checkpoint.

        With ``args.tune_mm_mlp_adapter`` set, only the model config and the
        adapter weights are written (as ``mm_projector.bin``); otherwise the
        parent implementation is used.
        """
        if not getattr(self.args, 'tune_mm_mlp_adapter', False):
            super()._save_checkpoint(model, trial, metrics)
            return

        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        folder_name = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        base_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(base_dir, folder_name)

        # Only save Adapter
        keys_to_match = ['mm_projector', 'vision_resampler']
        if getattr(self.args, "use_im_start_end", False):
            keys_to_match += ['embed_tokens', 'embed_in']

        # Gathered on every rank; only local rank 0 (or -1) writes to disk.
        weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)

        if self.args.local_rank in (0, -1):
            self.model.config.save_pretrained(output_dir)
            torch.save(weight_to_save, os.path.join(output_dir, 'mm_projector.bin'))

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Delegate to the parent ``_save`` unless ``args.tune_mm_mlp_adapter`` is set, in which case nothing is written."""
        if getattr(self.args, 'tune_mm_mlp_adapter', False):
            return
        super()._save(output_dir, state_dict)
