import os
import torch
import torch.nn as nn
import wandb

from torch.utils.data import Sampler
from torch.utils.data import SequentialSampler
from transformers import Trainer
# from .trainer_ import Trainer
from transformers.trainer import (
    is_sagemaker_mp_enabled,
    get_parameter_names,
    has_length,
    ALL_LAYERNORM_LAYERS,
    # ShardedDDPOption,
    logger,_is_peft_model, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
)
from typing import List, Optional,TYPE_CHECKING, Any, Callable, Optional, Union
from common_utils.sampling_utils import get_one_class_per_batch, get_one_class_per_batch_v2



def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
    """Collect the parameters whose name does not contain ``"lora_"``.

    Args:
        named_params: Iterable of ``(name, tensor)`` pairs.
        require_grad_only: When true, keep only tensors with ``requires_grad`` set.

    Returns:
        A dict mapping each kept name to the result of
        ``maybe_zero_3(tensor, ignore_status=True).cpu()``.
    """
    non_lora = {}
    for name, param in named_params:
        if "lora_" not in name:
            non_lora[name] = param

    if require_grad_only:
        non_lora = {name: param for name, param in non_lora.items() if param.requires_grad}

    # NOTE(review): `maybe_zero_3` presumably materialises DeepSpeed ZeRO-3
    # partitioned tensors; its definition is not visible in this file — confirm.
    gathered = {}
    for name, param in non_lora.items():
        gathered[name] = maybe_zero_3(param, ignore_status=True).cpu()
    return gathered


def get_peft_state_maybe_zero_3(named_params, bias):
    """Select the LoRA (and optionally bias) parameters to store in a checkpoint.

    Args:
        named_params: Iterable of ``(name, tensor)`` pairs.
        bias: Which bias tensors accompany the LoRA weights:
            ``"none"``      - only names containing ``"lora_"``;
            ``"all"``       - names containing ``"lora_"`` or ``"bias"``;
            ``"lora_only"`` - names containing ``"lora_"`` plus the bias whose
            name is ``<prefix before "lora_">bias`` for some LoRA parameter.

    Returns:
        A dict mapping each selected name to
        ``maybe_zero_3(tensor, ignore_status=True)``.

    Raises:
        NotImplementedError: If ``bias`` is not one of the values above.
    """
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                # Name of the bias that sits next to this LoRA weight.
                lora_bias_names.add(k.split("lora_")[0] + "bias")
            elif "bias" in k:
                maybe_lora_bias[k] = t
        # Iterate over (name, tensor) pairs and test the *current* bias name.
        # The previous code iterated the dict's keys (which fails to unpack)
        # and re-used the `bias_name` left over from the loop above.
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError
    # NOTE(review): `maybe_zero_3` is not defined or imported in the visible
    # part of this file — confirm it is in scope at runtime.
    to_return = {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
    return to_return

def store_all_param(named_params):
    """Return every parameter, passed through ``maybe_zero_3`` and moved to CPU.

    Args:
        named_params: Iterable of ``(name, tensor)`` pairs.

    Returns:
        A dict mapping each name to
        ``maybe_zero_3(tensor, ignore_status=True, name=name).cpu()``.
    """
    collected = {}
    for name, param in named_params:
        collected[name] = param

    on_cpu = {}
    for name, param in collected.items():
        on_cpu[name] = maybe_zero_3(param, ignore_status=True, name=name).cpu()
    return on_cpu



def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Collect the parameters whose name contains any of ``keys_to_match``.

    Args:
        named_params: Iterable of ``(name, tensor)`` pairs.
        keys_to_match: Iterable of substrings; a parameter is kept when at
            least one of them occurs in its name.

    Returns:
        A dict mapping each kept name to
        ``maybe_zero_3(tensor, ignore_status=True, name=name).cpu()``.
    """
    matched = {}
    for name, param in named_params:
        if any(pattern in name for pattern in keys_to_match):
            matched[name] = param

    on_cpu = {}
    for name, param in matched.items():
        on_cpu[name] = maybe_zero_3(param, ignore_status=True, name=name).cpu()
    return on_cpu


def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Distribute `indices` over `num_chunks` chunks.

    When `len(indices)` is not a multiple of `num_chunks`, the indices are
    simply dealt out with a stride of `num_chunks`. Otherwise every chunk
    receives the same number of indices, and each index is placed greedily in
    the chunk whose accumulated `lengths` total is currently the smallest.
    """
    total = len(indices)
    if total % num_chunks:
        return [indices[offset::num_chunks] for offset in range(num_chunks)]

    capacity = total // num_chunks
    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks

    for idx in indices:
        target = loads.index(min(loads))
        bucket = buckets[target]
        bucket.append(idx)
        loads[target] += lengths[idx]
        # A full bucket is taken out of the running by making its load infinite.
        if len(bucket) == capacity:
            loads[target] = float("inf")

    return buckets


def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    """Order sample indices so that each megabatch contains a single modality.

    The sign of each entry in `lengths` selects the group: positive entries go
    to the `mm` group and negative entries to the `lang` group (negated before
    use). Zero is rejected. When all entries share a sign, this defers to
    `get_length_grouped_indices`.

    Otherwise each group is length-grouped on its own and cut into megabatches
    of `world_size * batch_size` indices. All megabatches except the last one
    of each group are shuffled with `torch.randperm`; the two leftover
    megabatches are merged, sorted by index, and appended at the end.

    Returns:
        A flat list of dataset indices.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
        # Only one modality is present.
        return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)

    mm_indices, mm_lengths = [], []
    lang_indices, lang_lengths = [], []
    for position, length in enumerate(lengths):
        if length > 0:
            mm_indices.append(position)
            mm_lengths.append(length)
        if length < 0:
            lang_indices.append(position)
            lang_lengths.append(-length)

    # The per-modality grouping deliberately receives `generator=None`, as in the original code.
    mm_order = get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)
    mm_shuffle = [mm_indices[i] for i in mm_order]
    lang_order = get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)
    lang_shuffle = [lang_indices[i] for i in lang_order]

    megabatch_size = world_size * batch_size

    def _chunk(sequence):
        return [sequence[start : start + megabatch_size] for start in range(0, len(sequence), megabatch_size)]

    mm_megabatches = _chunk(mm_shuffle)
    lang_megabatches = _chunk(lang_shuffle)

    # The trailing megabatch of each modality is merged into one final batch.
    additional_batch = mm_megabatches[-1] + lang_megabatches[-1]
    full_megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]

    order = torch.randperm(len(full_megabatches), generator=generator).tolist()
    megabatches = [full_megabatches[i] for i in order]

    if len(additional_batch) > 0:
        megabatches.append(sorted(additional_batch))

    flat_indices = []
    for megabatch in megabatches:
        flat_indices.extend(megabatch)
    return flat_indices


def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """Return a random index order in which each megabatch is grouped by length.

    The indices `0 .. len(lengths) - 1` are permuted with `torch.randperm` and
    cut into megabatches of `world_size * batch_size` indices. Each megabatch
    is sorted by descending length and split into `world_size` chunks with
    `split_to_even_chunks`; the chunks are concatenated in order.

    `merge` is accepted but not used by this implementation.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    permutation = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size

    flat_indices = []
    for start in range(0, len(lengths), megabatch_size):
        megabatch = permutation[start : start + megabatch_size].tolist()
        megabatch.sort(key=lambda i: lengths[i], reverse=True)
        for chunk in split_to_even_chunks(megabatch, lengths, world_size):
            flat_indices.extend(chunk)
    return flat_indices


class LengthGroupedSampler(Sampler):
    r"""
    Sampler yielding dataset indices so that samples of similar length end up close together, while the order
    still keeps some randomness.

    With `group_by_modality=True` the order comes from `get_modality_length_grouped_indices`, otherwise from
    `get_length_grouped_indices`.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        # `lengths` has a default only for signature compatibility; it is mandatory.
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size, self.world_size = batch_size, world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        # One index is produced per entry of `lengths`.
        return len(self.lengths)

    def __iter__(self):
        grouping_fn = (
            get_modality_length_grouped_indices if self.group_by_modality else get_length_grouped_indices
        )
        ordered = grouping_fn(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        return iter(ordered)



class OneClassPerBatchSampler(Sampler):
    r"""
    Sampler that ensure only one entity class per batch, allowing contrastive loss to learn.

    The index order is produced by `get_one_class_per_batch_v2` from the `url_to_idx` mapping that is unpickled
    from `data_source.url_to_idx_path` at construction time.
    """
    # Original author's note: samplers run on each main worker, not on the dataloader workers.

    def __init__(
        self,
        data_source,
        batch_size,
        world_size,
        generator=None,
    ) -> None:
        import pickle

        self.data_source = data_source
        self.batch_size = batch_size
        self.world_size = world_size
        self.generator = generator

        # NOTE(review): pickle.load executes arbitrary code from the file;
        # make sure `url_to_idx_path` only ever points at trusted data.
        with open(self.data_source.url_to_idx_path, 'rb') as handle:
            self.url_to_idx = pickle.load(handle)

    def __len__(self):
        return len(self.data_source)

    def __iter__(self):
        ordered = get_one_class_per_batch_v2(
            self.world_size,
            self.batch_size,
            self.url_to_idx,
            len(self.data_source),
        )
        return iter(ordered)
    


class LLaVATrainer(Trainer):

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Pick the training sampler according to the custom training arguments.

        * `args.group_by_modality_length`: a `LengthGroupedSampler` built from
          `train_dataset.modality_lengths`, grouped by modality. The world size
          passed to it is multiplied by the gradient accumulation steps.
        * `args.one_class_per_batch`: an `OneClassPerBatchSampler`.
        * otherwise: whatever the base `Trainer` returns.
        """
        args = self.args
        train_dataset = self.train_dataset

        if args.group_by_modality_length:
            lengths = self.train_dataset.modality_lengths
            return LengthGroupedSampler(
                args.train_batch_size,
                world_size=args.world_size * args.gradient_accumulation_steps,
                lengths=lengths,
                group_by_modality=True,
            )

        if args.one_class_per_batch:
            return OneClassPerBatchSampler(
                train_dataset,
                args.train_batch_size,
                args.world_size,
            )

        return super()._get_train_sampler()

    def _get_eval_sampler(self, eval_dataset) -> Optional[torch.utils.data.Sampler]:
        """Pick the evaluation sampler.

        Returns an `OneClassPerBatchSampler` when `args.one_class_per_batch` is
        set, and a plain `SequentialSampler` over `eval_dataset` otherwise.
        """
        args = self.args
        if not args.one_class_per_batch:
            return SequentialSampler(eval_dataset)

        return OneClassPerBatchSampler(
            eval_dataset,
            args.eval_batch_size,
            args.world_size,
        )
      

    def prediction_step(
        self, model, inputs, prediction_loss_only, ignore_keys=None
    ):
        """
        Run one evaluation forward pass and report only the loss.

        The model is put in eval mode and called under `torch.no_grad()`; its
        output must support `outputs["loss"]`. Logits and labels are never
        returned, so the result is `(loss, None, None)` whatever the value of
        `prediction_loss_only`; `ignore_keys` is unused.
        """
        model.eval()
        prepared_inputs = self._prepare_inputs(inputs)

        with torch.no_grad():
            model_outputs = model(**prepared_inputs)
            eval_loss = model_outputs["loss"]
            # Release cached CUDA blocks after every evaluation step.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Same tuple layout as the default Trainer: (loss, logits, labels).
        return (eval_loss, None, None)

    
    
    
    
    def create_optimizer(self):
        """
        Setup the optimizer, optionally with dedicated learning rates for the projector modules.

        If `self.optimizer` already exists it is returned untouched, and on SageMaker model parallel the base
        implementation is used. Otherwise the trainable parameters are split into parameter groups:

        * weight decay applies to the names returned by `get_parameter_names(model, ALL_LAYERNORM_LAYERS)` that
          do not contain `"bias"`; all other parameters get `weight_decay=0.0`;
        * when `args.mm_projector_lr` or `args.ent_projector_lr` is set, parameters whose name contains
          `"mm_projector"` / `"ret_token_projector"` are put in their own groups with that learning rate. A
          projector learning rate left at `None` is filled in (on `self.args`) with `args.learning_rate`.

        Returns:
            The optimizer stored in `self.optimizer`.
        """
        if is_sagemaker_mp_enabled():
            return super().create_optimizer()

        if self.optimizer is not None:
            return self.optimizer

        opt_model = self.model
        args = self.args

        # A set gives O(1) membership tests; the former list-based `n in names`
        # checks were quadratic in the number of parameters.
        decay_names = {
            name for name in get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) if "bias" not in name
        }
        # Frozen parameters never reach the optimizer.
        trainable = [(name, param) for name, param in opt_model.named_parameters() if param.requires_grad]

        def select(decayed, belongs):
            """Trainable params, in model order, with the given decay status whose name satisfies `belongs`."""
            return [param for name, param in trainable if (name in decay_names) == decayed and belongs(name)]

        if args.mm_projector_lr is not None or args.ent_projector_lr is not None:
            if args.mm_projector_lr is None:
                args.mm_projector_lr = args.learning_rate
            if args.ent_projector_lr is None:
                args.ent_projector_lr = args.learning_rate

            def is_mm_projector(name):
                return "mm_projector" in name

            def is_ent_projector(name):
                return "ret_token_projector" in name

            def is_regular(name):
                return not is_mm_projector(name) and not is_ent_projector(name)

            optimizer_grouped_parameters = [
                # Everything that is not a projector.
                {"params": select(True, is_regular), "weight_decay": args.weight_decay},
                {"params": select(False, is_regular), "weight_decay": 0.0},
                # mm_projector layers.
                {
                    "params": select(True, is_mm_projector),
                    "weight_decay": args.weight_decay,
                    "lr": args.mm_projector_lr,
                },
                {
                    "params": select(False, is_mm_projector),
                    "weight_decay": 0.0,
                    "lr": args.mm_projector_lr,
                },
                # ret_token_projector layers.
                {
                    "params": select(True, is_ent_projector),
                    "weight_decay": args.weight_decay,
                    "lr": args.ent_projector_lr,
                },
                {
                    "params": select(False, is_ent_projector),
                    "weight_decay": 0.0,
                    "lr": args.ent_projector_lr,
                },
            ]
        else:

            def any_name(name):
                return True

            optimizer_grouped_parameters = [
                {"params": select(True, any_name), "weight_decay": args.weight_decay},
                {"params": select(False, any_name), "weight_decay": 0.0},
            ]

        optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

        self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
        if optimizer_cls.__name__ == "Adam8bit":
            import bitsandbytes

            manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

            # Register a 32-bit optimizer-state override for every embedding weight.
            skipped = 0
            for module in opt_model.modules():
                if isinstance(module, nn.Embedding):
                    skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                    logger.info(f"skipped {module}: {skipped/2**20}M params")
                    manager.register_module_override(module, "weight", {"optim_bits": 32})
                    logger.debug(f"bitsandbytes: will optimize {module} in fp32")
            logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer


    def compute_loss(
        self,
        model: nn.Module,
        inputs: dict[str, Union[torch.Tensor, Any]],
        return_outputs: bool = False,
        num_items_in_batch: Optional[torch.Tensor] = None,
    ):
        """
        Compute the training loss for one batch.

        Args:
            model (`nn.Module`):
                The model to compute the loss for.
            inputs (`dict[str, Union[torch.Tensor, Any]]`):
                The input data for the model. `"labels"` is popped from it when a label smoother or a custom
                loss function is configured.
            return_outputs (`bool`, *optional*, defaults to `False`):
                Whether to return the model outputs along with the loss.
            num_items_in_batch (Optional[torch.Tensor], *optional*):
                The number of items in the batch. When given and `self.model_accepts_loss_kwargs` is true, it is
                forwarded to the model as a keyword argument.

        Returns:
            The loss of the model along with its output if return_outputs was set to True

        Raises:
            ValueError: If no labels were extracted and the model returned a dict without a `"loss"` entry.

        If you are not using `num_items_in_batch` when computing your loss, make sure to set
        `self.model_accepts_loss_kwargs` to `False`. Otherwise, the loss calculation might be slightly
        inaccurate when performing gradient accumulation.
        """
        external_loss = self.label_smoother is not None or self.compute_loss_func is not None
        labels = inputs.pop("labels") if external_loss and "labels" in inputs else None

        if self.model_accepts_loss_kwargs:
            loss_kwargs = {} if num_items_in_batch is None else {"num_items_in_batch": num_items_in_batch}
            inputs = {**inputs, **loss_kwargs}

        outputs = model(**inputs)

        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is None:
            # The model is expected to have computed the loss itself.
            outputs_is_dict = isinstance(outputs, dict)
            if outputs_is_dict and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if outputs_is_dict else outputs[0]
        else:
            unwrapped_model = self.accelerator.unwrap_model(model)
            named_model = unwrapped_model.base_model.model if _is_peft_model(unwrapped_model) else unwrapped_model
            model_name = named_model._get_name()

            if self.compute_loss_func is not None:
                # User-defined compute_loss function
                loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch)
            elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)

        rescale = (
            self.args.average_tokens_across_devices
            and (self.model_accepts_loss_kwargs or self.compute_loss_func)
            and num_items_in_batch is not None
        )
        if rescale:
            loss *= self.accelerator.num_processes

        if return_outputs:
            return (loss, outputs)
        return loss


    def _save_checkpoint(self, model, trial, metrics=None):
        """Write the custom weight files into the checkpoint folder, then run the regular checkpointing.

        Depending on the flags on `self.args` the checkpoint folder receives:

        * `lora_enable`: the LoRA adapter (via `save_pretrained`) and `non_lora_trainables.bin`;
        * `tune_mm_mlp_adapter`: `mm_projector.bin`;
        * `retrieval`: `ent_token_projector.bin`.

        Files are written only when `args.local_rank` is 0 or -1, whereas the state dicts are collected on every
        rank. Afterwards `self.save_state()` and the base `_save_checkpoint(model, trial)` are called;
        `metrics` is accepted but not forwarded.
        """
        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        args = self.args
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)

        tune_adapter = getattr(args, 'tune_mm_mlp_adapter', False)
        retrieval = getattr(args, 'retrieval', False)

        # LoRA weights plus the remaining trainable (non-LoRA) parameters.
        # NOTE(review): collection happens outside the rank check, presumably
        # because `maybe_zero_3` must run on every rank under ZeRO-3 — confirm.
        if getattr(args, 'lora_enable', False):
            lora_state = get_peft_state_maybe_zero_3(model.named_parameters(), args.lora_bias)
            non_lora_state = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
            if args.local_rank in (0, -1):
                self.model.config.save_pretrained(output_dir)
                self.model.save_pretrained(output_dir, state_dict=lora_state)
                torch.save(non_lora_state, os.path.join(output_dir, 'non_lora_trainables.bin'))

        if tune_adapter:
            # Only the adapter weights are collected here.
            adapter_keys = ['mm_projector', 'vision_resampler', 'mm_cls_projector']
            if getattr(args, "use_im_start_end", False):
                adapter_keys += ['embed_tokens', 'embed_in']
            adapter_state = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), adapter_keys)

        if retrieval:
            retrieval_keys = [
                "query_projector",
                "context_projector",
                "autoreg_projector",
                "lm_head",
                'embed_tokens',
                'itc_temp',
                "retr_projector",
                'mm_cls_projector',
                "visual_processor",
                "visual_weight_projector",
                "rank_head",
                "pre_ent_token_projector",
            ]
            retrieval_state = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), retrieval_keys)

        if args.local_rank in (0, -1):
            self.model.config.save_pretrained(output_dir)
            if tune_adapter:
                torch.save(adapter_state, os.path.join(output_dir, 'mm_projector.bin'))
            if retrieval:
                torch.save(retrieval_state, os.path.join(output_dir, 'ent_token_projector.bin'))

        self.save_state()
        # The regular Trainer checkpoint is always written in addition to the files above.
        super(LLaVATrainer, self)._save_checkpoint(model, trial)
    

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Delegate to the base `_save`, unless `args.tune_mm_mlp_adapter` is set, in which case nothing is saved."""
        if not getattr(self.args, 'tune_mm_mlp_adapter', False):
            super(LLaVATrainer, self)._save(output_dir, state_dict)
    
