import os
import torch
import torch.nn as nn

from torch.utils.data import Sampler

from transformers import Trainer
from transformers.trainer import (
    is_sagemaker_mp_enabled,
    get_parameter_names,
    has_length,
    ALL_LAYERNORM_LAYERS,
    logger,
)
import random
from typing import List, Optional
import math
import time
import os
import sys
import shutil
import torch
import torch.nn as nn
import torch.optim
import torch.distributed as dist
import torch.nn.functional as F
from torch.utils.data import DataLoader, Sampler
from transformers.utils import is_datasets_available
from transformers.trainer_utils import seed_worker
from transformers import TrainerState
from transformers.integrations import hp_params
from transformers.trainer_pt_utils import get_model_param_count
from transformers.trainer_utils import speed_metrics
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
from accelerate import Accelerator
from accelerate.utils import release_memory
from packaging import version
import logging
from transformers.integrations.deepspeed import deepspeed_init
from transformers.utils import is_torch_tpu_available
from transformers.trainer_utils import TrainOutput
import deepspeed
from safetensors.torch import save_file
import datasets

import torch.nn.functional as F


def maybe_zero_3(param, ignore_status=False, name=None):
    """Return a detached CPU clone of ``param``, gathering it first if needed.

    A parameter is treated as DeepSpeed ZeRO-managed when it carries a
    ``ds_id`` attribute; in that case it is gathered with
    ``zero.GatheredParameters`` before being copied. Anything else is copied
    directly.

    Args:
        param: Tensor or parameter to copy.
        ignore_status: When False, a message is printed if a ZeRO-managed
            parameter reports ``ZeroParamStatus.NOT_AVAILABLE``.
        name: Label used only in that printed message.

    Returns:
        A detached clone of the parameter data located on the CPU.
    """
    if not hasattr(param, "ds_id"):
        return param.detach().cpu().clone()

    # Imported lazily so plain tensors can be copied without touching
    # (or even having installed) DeepSpeed.
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
        print(name, 'no ignore status')
    with zero.GatheredParameters([param]):
        return param.data.detach().cpu().clone()
def save_split_prompt_embeddings_init(weight, num_split, save_dir):
    """Split ``weight`` row-wise into ``num_split`` parts and save each one.

    Rows are divided into consecutive chunks of ``ceil(rows / num_split)``
    rows. Part ``i`` (1-based) is written to
    ``<save_dir>/init_memorytokens_part<i>.safetensors`` under the key
    ``"prompt_embeddings"``. When the rows do not divide evenly, trailing
    parts may hold fewer rows (possibly none); one file is still written per
    part.

    Args:
        weight: Tensor indexed as ``weight[start:end, :]`` (at least 2-D).
        num_split: Number of files to write.
        save_dir: Output directory; created if it does not exist.
    """
    # e.g. weight = sdict["module.prompt_encoder.default.embedding.weight"]
    if save_dir:
        # Avoid failing on the first save when the directory is missing.
        os.makedirs(save_dir, exist_ok=True)

    total = weight.shape[0]
    split_size = math.ceil(total / num_split)
    for part in range(num_split):
        start = part * split_size
        end = min(start + split_size, total)
        # safetensors refuses non-contiguous tensors, so force a contiguous copy.
        chunk = weight[start:end, :].clone(memory_format=torch.contiguous_format)
        target = os.path.join(save_dir, f"init_memorytokens_part{part + 1}.safetensors")
        save_file({"prompt_embeddings": chunk}, target)  # stored in safetensors format

def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Collect CPU copies of the parameters whose name matches any given key.

    Args:
        named_params: Iterable of ``(name, tensor)`` pairs.
        keys_to_match: Substrings; a parameter is selected when its name
            contains at least one of them.

    Returns:
        Dict mapping each selected name to the result of ``maybe_zero_3``
        (called with ``ignore_status=True``) moved to the CPU.
    """
    selected = {}
    for param_name, tensor in named_params:
        if any(fragment in param_name for fragment in keys_to_match):
            selected[param_name] = tensor

    gathered = {}
    for param_name, tensor in selected.items():
        gathered[param_name] = maybe_zero_3(tensor, ignore_status=True, name=param_name).cpu()
    return gathered


# def split_to_even_chunks(indices, lengths, num_chunks):
#     """
#     Split a list of indices into `chunks` chunks of roughly equal lengths.
#     """

#     if len(indices) % num_chunks != 0:
#         return [indices[i::num_chunks] for i in range(num_chunks)]

#     num_indices_per_chunk = len(indices) // num_chunks

#     chunks = [[] for _ in range(num_chunks)]
#     chunks_lengths = [0 for _ in range(num_chunks)]
#     for index in indices:
#         shortest_chunk = chunks_lengths.index(min(chunks_lengths))
#         chunks[shortest_chunk].append(index)
#         chunks_lengths[shortest_chunk] += lengths[index]
#         if len(chunks[shortest_chunk]) == num_indices_per_chunk:
#             chunks_lengths[shortest_chunk] = float("inf")

#     return chunks


# def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
#     # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
#     assert all(l != 0 for l in lengths), "Should not have zero length."
#     if all(l > 0 for l in lengths) or all(l < 0 for l in lengths):
#         # all samples are in the same modality
#         return get_length_grouped_indices(lengths, batch_size, world_size, generator=generator)
#     mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
#     lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])

#     mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]
#     lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]
#     megabatch_size = world_size * batch_size
#     mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
#     lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

#     last_mm = mm_megabatches[-1]
#     last_lang = lang_megabatches[-1]
#     additional_batch = last_mm + last_lang
#     megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
#     megabatch_indices = torch.randperm(len(megabatches), generator=generator)
#     megabatches = [megabatches[i] for i in megabatch_indices]

#     if len(additional_batch) > 0:
#         megabatches.append(sorted(additional_batch))

#     return [i for megabatch in megabatches for i in megabatch]


# def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
#     # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
#     indices = torch.randperm(len(lengths), generator=generator)
#     megabatch_size = world_size * batch_size
#     megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]
#     megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]
#     megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches]

#     return [i for megabatch in megabatches for batch in megabatch for i in batch]


# class LengthGroupedSampler(Sampler):
#     r"""
#     Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while
#     keeping a bit of randomness.
#     """

#     def __init__(
#         self,
#         batch_size: int,
#         world_size: int,
#         lengths: Optional[List[int]] = None,
#         generator=None,
#         group_by_modality: bool = False,
#     ):
#         if lengths is None:
#             raise ValueError("Lengths must be provided.")

#         self.batch_size = batch_size
#         self.world_size = world_size
#         self.lengths = lengths
#         self.generator = generator
#         self.group_by_modality = group_by_modality
#         self.knowledge_id_map = {}

#     def __len__(self):
#         return len(self.lengths)

#     def __iter__(self):
#         if self.group_by_modality:
#             indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
#         else:
#             indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)
#         return iter(indices)
#     def add_knowledge_id_map(self, knowledge_id_map):
#         """
#         Add a mapping of indices to knowledge IDs.
#         This can be used to track which knowledge ID corresponds to which index in the dataset.
#         """
#         self.knowledge_id_map = knowledge_id_map
#         print(f"Knowledge ID map added in sampler with {len(self.knowledge_id_map)} entries.")


# class KnowledgeIdDataLoader(DataLoader):
#     def __init__(self, dataset, *args, **kwargs):
#         super().__init__(dataset, *args, **kwargs)
#         self.knowledge_id_map = {}
#         for idx in range(len(dataset)):
#             item = dataset[idx]
#             # Assumes every item is a dict with a 'knowledge_id' field
#             self.knowledge_id_map[idx] = item['knowledge_id']
#         print(f"KnowledgeIdDataLoader initialized with {len(self.knowledge_id_map)} items.")
#         print(f"Knowledge IDs: {self.knowledge_id_map}")
#         self.sampler.add_knowledge_id_map(self.knowledge_id_map)
import random
import torch
from collections import defaultdict
from torch.utils.data import Sampler, DataLoader
from typing import List, Optional

def split_to_even_chunks(indices, lengths, num_chunks):
    """Distribute ``indices`` over ``num_chunks`` lists of balanced total length.

    When the number of indices is not a multiple of ``num_chunks`` the indices
    are simply dealt out round-robin. Otherwise each index goes to the chunk
    with the smallest accumulated length (first one on ties), and a chunk stops
    receiving indices once it holds its equal share.

    Args:
        indices: Sequence of sample indices.
        lengths: Lookup from index to sample length.
        num_chunks: Number of chunks to produce.

    Returns:
        A list of ``num_chunks`` lists of indices.
    """
    if len(indices) % num_chunks:
        return [indices[offset::num_chunks] for offset in range(num_chunks)]

    capacity = len(indices) // num_chunks
    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks
    for idx in indices:
        # min() returns the first minimal position, i.e. the lowest-numbered bucket on ties.
        target = min(range(num_chunks), key=loads.__getitem__)
        bucket = buckets[target]
        bucket.append(idx)
        loads[target] += lengths[idx]
        if len(bucket) == capacity:
            # A full bucket must never be picked again.
            loads[target] = float("inf")
    return buckets

def get_length_grouped_indices(lengths, batch_size, world_size, knowledge_id_map, generator=None):
    """Order sample indices so each ``batch_size`` window shares one knowledge_id.

    Indices are grouped by knowledge_id, shuffled inside each group, cut into
    batches of ``batch_size`` and sorted by descending length within a batch.
    Full batches are then shuffled; leftover batches shorter than
    ``batch_size`` are appended at the end so they cannot shift the alignment
    of the full batches when the flat list is re-chunked by a DataLoader.

    All randomness is drawn from ``generator`` (or torch's global RNG when it
    is None), so a seeded generator gives a reproducible order.

    Args:
        lengths: Lookup from sample index to sample length.
        batch_size: Number of samples per batch.
        world_size: Unused; kept for interface compatibility. The previous
            grouping into chunks of ``world_size`` batches was flattened
            immediately and did not influence the result.
        knowledge_id_map: Dict mapping sample index to knowledge_id. Only
            indices present in this map are returned.
        generator: Optional ``torch.Generator`` used for every shuffle.

    Returns:
        Flat list of sample indices.
    """
    # Group indices by knowledge_id.
    groups = defaultdict(list)
    for idx, kid in knowledge_id_map.items():
        groups[kid].append(idx)

    full_batches = []
    partial_batches = []
    for members in groups.values():
        # Shuffle inside the group, then cut into batches.
        order = torch.randperm(len(members), generator=generator).tolist()
        shuffled = [members[pos] for pos in order]
        for start in range(0, len(shuffled), batch_size):
            batch = sorted(
                shuffled[start:start + batch_size],
                key=lambda sample: lengths[sample],
                reverse=True,
            )
            if len(batch) == batch_size:
                full_batches.append(batch)
            else:
                partial_batches.append(batch)

    # Mix knowledge_ids across batches using the caller's generator rather than
    # the global `random` module, which would ignore the seed.
    batch_order = torch.randperm(len(full_batches), generator=generator).tolist()
    ordered = [full_batches[pos] for pos in batch_order] + partial_batches

    return [sample for batch in ordered for sample in batch]

# def get_modality_length_grouped_indices(lengths, batch_size, world_size, knowledge_id_map, generator=None):
#     """
#     Group indices by modality and knowledge_id, ensuring each batch has the same knowledge_id.
    
#     Args:
#         lengths: List of sample lengths (positive for mm, negative for lang).
#         batch_size: Number of samples per batch per process.
#         world_size: Number of processes in distributed training.
#         knowledge_id_map: Dict mapping indices to knowledge_ids.
#         generator: Random number generator for shuffling.
    
#     Returns:
#         List of indices with batches grouped by knowledge_id and modality.
#     """
#     assert all(l != 0 for l in lengths), "Should not have zero length."
#     if all(l > 0 for l in lengths) or all(l <  0 for l in lengths):
#         return get_length_grouped_indices(lengths, batch_size, world_size, knowledge_id_map, generator=generator)
    
#     # Separate modalities
#     mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])
#     lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])
    
#     # Create knowledge_id maps for each modality
#     mm_kid_map = {i: knowledge_id_map[mm_indices[i]] for i in range(len(mm_indices))}
#     lang_kid_map = {i: knowledge_id_map[lang_indices[i]] for i in range(len(lang_indices))}
    
#     # Get grouped indices for each modality
#     mm_shuffle = get_length_grouped_indices(mm_lengths, batch_size, world_size, mm_kid_map, generator=generator)
#     lang_shuffle = get_length_grouped_indices(lang_lengths, batch_size, world_size, lang_kid_map, generator=generator)
    
#     # Map back to original indices
#     mm_shuffle = [mm_indices[i] for i in mm_shuffle]
#     lang_shuffle = [lang_indices[i] for i in lang_shuffle]
    
#     # Form megabatches
#     megabatch_size = world_size * batch_size
#     mm_megabatches = [mm_shuffle[i:i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
#     lang_megabatches = [lang_shuffle[i:i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]
    
#     # Handle last batches and shuffle
#     last_mm = mm_megabatches[-1] if mm_megabatches else []
#     last_lang = lang_megabatches[-1] if lang_megabatches else []
#     additional_batch = last_mm + last_lang
#     megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
#     megabatch_indices = torch.randperm(len(megabatches), generator=generator)
#     megabatches = [megabatches[i] for i in megabatch_indices]
    
#     if additional_batch:
#         megabatches.append(sorted(additional_batch))
    
#     return [i for megabatch in megabatches for i in megabatch]
def get_modality_length_grouped_indices(lengths, batch_size, world_size, knowledge_id_map, generator=None):
    """Order indices so each batch shares one knowledge_id and one modality.

    Indices are grouped by knowledge_id (in the map's iteration order). Inside
    a group, samples with positive length and samples with negative length are
    shuffled separately and cut into batches of ``batch_size``. Batches are
    emitted in that order, without shuffling across knowledge_ids.

    Args:
        lengths: Lookup from sample index to signed length; the sign selects
            the modality bucket (positive vs. negative).
        batch_size: Number of samples per batch.
        world_size: Unused; kept for interface compatibility. Chunking the
            batches by ``world_size`` before flattening does not change the
            flattened order.
        knowledge_id_map: Dict mapping sample index to knowledge_id. Only
            indices present in this map are returned.
        generator: Optional ``torch.Generator`` used for the shuffles.

    Returns:
        Flat list of sample indices.

    Raises:
        ValueError: If a mapped sample has length 0; it belongs to neither
            bucket and would otherwise be dropped silently.
    """
    # 1. Group by knowledge_id.
    kid_to_indices = defaultdict(list)
    for idx, kid in knowledge_id_map.items():
        if lengths[idx] == 0:
            raise ValueError("Should not have zero length.")
        kid_to_indices[kid].append(idx)

    all_batches = []
    for indices in kid_to_indices.values():
        # 2. Split the group by modality (sign of the length).
        multimodal = [i for i in indices if lengths[i] > 0]
        language = [i for i in indices if lengths[i] < 0]

        # 3. Shuffle each modality on its own and cut it into batches.
        for pool in (multimodal, language):
            if not pool:
                continue
            order = torch.randperm(len(pool), generator=generator).tolist()
            shuffled = [pool[pos] for pos in order]
            all_batches.extend(
                shuffled[start:start + batch_size]
                for start in range(0, len(shuffled), batch_size)
            )

    # 4. Batch order is intentionally left unshuffled.
    # NOTE(review): a group whose size is not a multiple of batch_size yields a
    # short batch in the middle of the list, which shifts later batch
    # boundaries when a DataLoader re-chunks it — confirm this is acceptable.

    # 5. Flatten.
    return [sample for batch in all_batches for sample in batch]
class LengthGroupedSampler(Sampler):
    """Sampler that yields indices batched by knowledge_id (and optionally modality).

    The knowledge-id map is not known at construction time; it must be
    registered through ``add_knowledge_id_map`` before the sampler is iterated.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        """
        Args:
            batch_size: Number of samples per batch.
            world_size: Forwarded to the index-grouping function.
            lengths: Per-sample lengths; required.
            generator: Optional ``torch.Generator`` forwarded for shuffling.
            group_by_modality: Select the modality-aware grouping function.

        Raises:
            ValueError: If ``lengths`` is None.
        """
        if lengths is None:
            raise ValueError("Lengths must be provided.")
        self.batch_size = batch_size
        self.world_size = world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality
        # Filled in later by add_knowledge_id_map().
        self.knowledge_id_map = {}

    def __len__(self):
        """Return the number of samples described by ``lengths``."""
        return len(self.lengths)

    def __iter__(self):
        """Return an iterator over the grouped sample indices.

        Raises:
            RuntimeError: If the dataset is non-empty but no knowledge-id map
                was registered. Without the map the grouping functions would
                yield nothing while ``__len__`` still reports every sample.
        """
        if len(self.lengths) > 0 and not self.knowledge_id_map:
            raise RuntimeError(
                "LengthGroupedSampler has no knowledge_id_map; call "
                "add_knowledge_id_map() before iterating."
            )

        if self.group_by_modality:
            grouping_fn = get_modality_length_grouped_indices
        else:
            grouping_fn = get_length_grouped_indices
        indices = grouping_fn(
            self.lengths, self.batch_size, self.world_size, self.knowledge_id_map, self.generator
        )
        return iter(indices)

    def add_knowledge_id_map(self, knowledge_id_map):
        """Register the mapping from sample index to knowledge ID."""
        self.knowledge_id_map = knowledge_id_map
        print(f"Knowledge ID map added in sampler with {len(self.knowledge_id_map)} entries.")

class KnowledgeIdDataLoader(DataLoader):
    """DataLoader that hands its sampler a sample-index -> knowledge_id map.

    On construction every dataset item is read once to collect its
    ``'knowledge_id'`` entry; the resulting map is stored on the loader and
    passed to ``sampler.add_knowledge_id_map``.
    """

    def __init__(self, dataset, *args, **kwargs):
        """
        Args:
            dataset: Map-style dataset whose items support
                ``item['knowledge_id']``.
            *args, **kwargs: Forwarded unchanged to ``DataLoader``.

        Raises:
            AttributeError: If the sampler in use does not provide
                ``add_knowledge_id_map``.
        """
        super().__init__(dataset, *args, **kwargs)

        # Check the sampler first so a wrong sampler fails before the
        # (potentially slow) pass over the whole dataset.
        register = getattr(self.sampler, "add_knowledge_id_map", None)
        if register is None:
            raise AttributeError(
                f"{type(self.sampler).__name__} has no add_knowledge_id_map(); "
                "KnowledgeIdDataLoader needs a sampler such as LengthGroupedSampler."
            )

        self.knowledge_id_map = {
            idx: dataset[idx]['knowledge_id'] for idx in range(len(dataset))
        }
        print(f"KnowledgeIdDataLoader initialized with {len(self.knowledge_id_map)} items.")
        # Report a bounded summary instead of dumping one entry per sample.
        print(f"Knowledge IDs: {len(set(self.knowledge_id_map.values()))} distinct")
        register(self.knowledge_id_map)
        
            
# class KnowledgeGrad:
#     def __init__(self, knowledge_length=2, Threshold=0.5,Mod = 'mean'):
#         self.knowledge_length = knowledge_length
#         self.knowledge_in_ourset = []
#         self.last_knowledge_id = 0
#         self.Knowledge_type_grad = {}
#         self.Threshold = Threshold
#         self.Mod = Mod
#         self.cosine = {}
#         self.cpu = torch.device('cpu')
        
#     def get_knowledge_length(self):
#         """
#         Retrieve the length of knowledge.
#         Returns:
#             int: The length of knowledge.
#         """
#         return self.knowledge_length
#     def get_synthetic_grad(self, param_name, without_knowledge_id):
#         """
#         Retrieve the synthetic gradient for a specific parameter without a knowledge ID.
#         Args:
#             param_name (str): The name of the parameter.
#             without_knowledge_id (int): The ID of the knowledge to exclude.
#         Returns:
#             List[torch.Tensor]: gradients for the specified parameter excluding the given knowledge ID.
#         """
#         if param_name in self.Knowledge_type_grad:
#             grads = [
#                 grad for knowledge_id, grad in self.Knowledge_type_grad[param_name].items()
#                 if knowledge_id != without_knowledge_id
#             ]
#             if grads:
#                 return torch.stack(grads, dim=0).sum(dim=0)  # vector sum
#         return None
#     def set_grad(self, param_name, knowledge_id, grad):
#         """        
#         Store the gradient for a specific grad with knowledge ID.
#         Args:
#             param_name (str): The name of the parameter.
#             knowledge_id (int): The ID of the knowledge.
#             grad (torch.Tensor): The gradient tensor to store.
#         """

#         if knowledge_id > self.knowledge_length:
#             raise ValueError(f"knowledge_id {knowledge_id} exceeds knowledge_length {self.knowledge_length}")
#         if not isinstance(grad, torch.Tensor):
#             print(f"Expected {param_name} grad to be a torch.Tensor, got {type(grad)}")
#             return
#             raise ValueError(f"Expected grad to be a torch.Tensor, got {type(grad)}")
#         if not knowledge_id in self.knowledge_in_ourset: 
#             self.knowledge_in_ourset.append(knowledge_id)
#         if param_name not in self.Knowledge_type_grad:
#             self.Knowledge_type_grad[param_name] = {}
#         self.Knowledge_type_grad[param_name][knowledge_id] = grad
#     def get_grad(self, param_name, knowledge_id):
#         """
#         Retrieve the stored gradient for a specific parameter and knowledge ID.
#         Args:
#             param_name (str): The name of the parameter.
#             knowledge_id (int): The ID of the knowledge.
#         Returns:
#             List[torch.Tensor]: A list of gradients for the specified parameter and knowledge ID.
#         """
#         if param_name in self.Knowledge_type_grad and knowledge_id in self.Knowledge_type_grad[param_name]:
#             return self.Knowledge_type_grad[param_name][knowledge_id]
#         else:
#             return None
        
#     def cosine_similarity_with_rest(self, param_name, knowledge_id , grad_self):
#         """
#         Calculate cosine similarity between the gradient of a given knowledge ID
#         and the vector sum of all other gradients.
#         """
#         if len(self.knowledge_in_ourset) == 1 and self.knowledge_in_ourset[0] == knowledge_id:
#             return -1.0
            
#         grad_rest_sum = self.get_synthetic_grad(param_name, without_knowledge_id=knowledge_id)
#         if grad_self is None or grad_rest_sum is None:
#             return None
#         grad_self_flat = grad_self.view(-1)
#         grad_rest_flat = grad_rest_sum.view(-1)
#         return F.cosine_similarity(grad_self_flat.unsqueeze(0), grad_rest_flat.unsqueeze(0)).item()
    
#     def update_param(self, param_name, knowledge_id , grad_self):
#         """_summary_

#         Args:
#             param_name (_type_): _description_
#             knowledge_id (_type_): _description_
#             grad_self (_type_): _description_

#         Returns:
#             _type_: _description_
#         """
#         self.set_grad(param_name, knowledge_id, grad_self)
#         if len(self.knowledge_in_ourset) == 1 and knowledge_id in self.knowledge_in_ourset:
#             return True
#         elif len(self.knowledge_in_ourset) == 0:
#             return False
#         if self.Mod == 'mean':
#             cosine_sim = self.cosine_similarity_with_rest(param_name, knowledge_id , grad_self)
#         else:
#             cosine_sim = self.max_cosine_with_rest(param_name, knowledge_id, grad_self)
#         self.add_cosine(param_name,cosine_sim,knowledge_id)
#         if cosine_sim < self.Threshold:
#             return True
#         else:
#             return False
#     def add_cosine(self,param_name,cosine,knowledge_id):
#         if param_name not in self.cosine:
#             self.cosine[param_name] = {}
#         if knowledge_id not in self.cosine[param_name]:
#             self.cosine[param_name][knowledge_id] = []
#         self.cosine[param_name][knowledge_id].append(cosine)
#     def print_cosine(self):
#         print(self.cosine)
#     def get_cosine(self):
#         return self.cosine
#     def max_cosine_with_rest(self, param_name, knowledge_id,grad_self):
#         """_summary_

#         Args:
#             param_name (_type_): _description_
#             knowledge_id (_type_): _description_
#             grad_self (_type_): _description_

#         Returns:
#             _type_: _description_
#         """
#         if param_name not in self.Knowledge_type_grad:
#             return None
#         grads_dict = self.Knowledge_type_grad[param_name]
#         if knowledge_id not in grads_dict:
#             return None
#         grad_self = grad_self.view(1, -1)  # shape: (1, D)
#         other_grads = [
#             grad.view(-1) for k, grad in grads_dict.items() if k != knowledge_id
#         ]
#         if not other_grads:
#             return None
#         other_grads = torch.stack(other_grads, dim=0)  # shape: (N-1, D)
#         cos_sims = F.cosine_similarity(grad_self, other_grads)  # shape: (N-1,)
#         return cos_sims.abs().max().item()




class KnowledgeGrad:
    """Store the latest gradient per (parameter, knowledge id) and compare them.

    Gradients are kept on the CPU and moved to ``compute_device`` only while a
    cosine similarity is evaluated. The similarity of a new gradient to the
    gradients stored for the other knowledge ids is used either to decide
    whether a parameter should be updated (``update_param``) or to rescale the
    gradient (``update_grad``).
    """

    def __init__(self, knowledge_length=2, Threshold=0.2, Mod='mean', K=0.5):
        """
        Args:
            knowledge_length: Largest knowledge id accepted by ``set_grad``.
            Threshold: ``update_param`` returns True when the similarity is
                strictly below this value.
            Mod: ``'mean'`` compares against the sum of the other stored
                gradients; any other value uses the largest absolute pairwise
                similarity.
            K: Scale factor in the exponential weight of ``update_grad``.
        """
        self.knowledge_length = knowledge_length
        # Knowledge ids for which at least one tensor gradient was stored.
        self.knowledge_in_ourset = []
        self.last_knowledge_id = 0
        # param_name -> {knowledge_id -> gradient tensor on the CPU}
        self.Knowledge_type_grad = {}
        self.Threshold = Threshold
        self.Mod = Mod
        # param_name -> {knowledge_id -> [similarities recorded by update_param]}
        self.cosine = {}
        self.cpu = torch.device('cpu')
        # Fall back to the CPU so the class also works on machines without CUDA.
        self.compute_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.K = K

    def get_knowledge_length(self):
        """Return the configured ``knowledge_length``."""
        return self.knowledge_length

    def get_synthetic_grad(self, param_name, without_knowledge_id):
        """Sum the gradients stored for ``param_name`` over all other knowledge ids.

        Args:
            param_name (str): The name of the parameter.
            without_knowledge_id (int): The knowledge id to leave out.

        Returns:
            torch.Tensor on ``compute_device`` holding the element-wise sum, or
            None when no gradient of another knowledge id is stored.
        """
        stored = self.Knowledge_type_grad.get(param_name)
        if not stored:
            return None
        others = [
            grad.to(self.compute_device)
            for knowledge_id, grad in stored.items()
            if knowledge_id != without_knowledge_id
        ]
        if not others:
            return None
        return torch.stack(others, dim=0).sum(dim=0)  # vector sum

    def set_grad(self, param_name, knowledge_id, grad):
        """Store a CPU copy of ``grad`` for ``param_name`` and ``knowledge_id``.

        A non-tensor ``grad`` is reported on stdout and ignored.

        Args:
            param_name (str): The name of the parameter.
            knowledge_id (int): The ID of the knowledge.
            grad (torch.Tensor): The gradient tensor to store.

        Raises:
            ValueError: If ``knowledge_id`` is greater than ``knowledge_length``.
        """
        # NOTE(review): an id equal to knowledge_length is accepted — confirm
        # whether ids are 0-based, in which case this bound is one too loose.
        if knowledge_id > self.knowledge_length:
            raise ValueError(f"knowledge_id {knowledge_id} exceeds knowledge_length {self.knowledge_length}")
        if not isinstance(grad, torch.Tensor):
            print(f"Expected {param_name} grad to be a torch.Tensor, got {type(grad)}")
            return
        if knowledge_id not in self.knowledge_in_ourset:
            self.knowledge_in_ourset.append(knowledge_id)
        # copy=True: `.to(cpu)` alone returns the same tensor for CPU inputs, so
        # a later in-place change of the live gradient would corrupt the store.
        per_param = self.Knowledge_type_grad.setdefault(param_name, {})
        per_param[knowledge_id] = grad.detach().to(self.cpu, copy=True)

    def get_grad(self, param_name, knowledge_id):
        """Return the stored gradient for ``param_name``/``knowledge_id``.

        Args:
            param_name (str): The name of the parameter.
            knowledge_id (int): The ID of the knowledge.

        Returns:
            The stored CPU tensor, or None when nothing is stored.
        """
        return self.Knowledge_type_grad.get(param_name, {}).get(knowledge_id)

    def cosine_similarity_with_rest(self, param_name, knowledge_id , grad_self):
        """Cosine similarity of ``grad_self`` to the sum of the other gradients.

        Returns:
            -1.0 when ``knowledge_id`` is the only knowledge id seen so far,
            None when ``grad_self`` is None or no other gradient is stored for
            ``param_name``, otherwise the similarity as a float.
        """
        if len(self.knowledge_in_ourset) == 1 and self.knowledge_in_ourset[0] == knowledge_id:
            return -1.0

        grad_rest_sum = self.get_synthetic_grad(param_name, without_knowledge_id=knowledge_id)
        if grad_self is None or grad_rest_sum is None:
            return None
        grad_self_flat = grad_self.to(self.compute_device).view(-1)
        grad_rest_flat = grad_rest_sum.view(-1)
        return F.cosine_similarity(grad_self_flat.unsqueeze(0), grad_rest_flat.unsqueeze(0)).item()

    def _similarity(self, param_name, knowledge_id, grad_self):
        """Dispatch to the similarity measure selected by ``Mod``."""
        if self.Mod == 'mean':
            return self.cosine_similarity_with_rest(param_name, knowledge_id, grad_self)
        return self.max_cosine_with_rest(param_name, knowledge_id, grad_self)

    def update_grad(self, param_name, knowledge_id, grad_self):
        """Store ``grad_self`` and return it rescaled by its similarity weight.

        The weight is ``exp(K * (similarity + 1))``.

        Args:
            param_name (str): The name of the parameter.
            knowledge_id (int): The ID of the knowledge.
            grad_self (torch.Tensor): The gradient tensor to update.

        Returns:
            The weighted gradient on ``compute_device``, or ``grad_self``
            unchanged when there is nothing to compare it with.
        """
        self.set_grad(param_name, knowledge_id, grad_self)
        if len(self.knowledge_in_ourset) == 1 and knowledge_id in self.knowledge_in_ourset:
            return grad_self
        elif len(self.knowledge_in_ourset) == 0:
            return grad_self

        cosine_sim = self._similarity(param_name, knowledge_id, grad_self)
        if cosine_sim is None:
            return grad_self
        weight = math.exp(self.K * (cosine_sim + 1))
        # Move the gradient to the compute device for the multiplication.
        return grad_self.to(self.compute_device) * weight

    def update_param(self, param_name, knowledge_id , grad_self):
        """Store ``grad_self`` and decide whether the parameter should be updated.

        Args:
            param_name (str): The name of the parameter.
            knowledge_id (int): The ID of the knowledge.
            grad_self (torch.Tensor): The gradient for this knowledge id.

        Returns:
            True when ``knowledge_id`` is the only knowledge id seen, when no
            similarity can be computed, or when the similarity is below
            ``Threshold``; False when no knowledge id has been stored or the
            similarity is at least ``Threshold``. Computed similarities are
            recorded via ``add_cosine``.
        """
        self.set_grad(param_name, knowledge_id, grad_self)
        if len(self.knowledge_in_ourset) == 1 and knowledge_id in self.knowledge_in_ourset:
            return True
        elif len(self.knowledge_in_ourset) == 0:
            return False

        cosine_sim = self._similarity(param_name, knowledge_id, grad_self)
        if cosine_sim is None:
            # Nothing to compare against for this parameter; comparing None
            # with the threshold would raise TypeError.
            return True
        self.add_cosine(param_name, cosine_sim, knowledge_id)
        return cosine_sim < self.Threshold

    def add_cosine(self,param_name,cosine,knowledge_id):
        """Append ``cosine`` to the history kept for ``param_name``/``knowledge_id``."""
        self.cosine.setdefault(param_name, {}).setdefault(knowledge_id, []).append(cosine)

    def print_cosine(self):
        """Print the recorded similarity history."""
        print(self.cosine)

    def get_cosine(self):
        """Return the recorded similarity history."""
        return self.cosine

    def max_cosine_with_rest(self, param_name, knowledge_id,grad_self):
        """Largest absolute cosine similarity of ``grad_self`` to any other gradient.

        Args:
            param_name (str): The name of the parameter.
            knowledge_id (int): The knowledge id ``grad_self`` belongs to.
            grad_self (torch.Tensor): The gradient to compare.

        Returns:
            The maximum absolute similarity as a float, or None when
            ``grad_self`` is None, nothing is stored for ``param_name`` and
            ``knowledge_id``, or no other knowledge id has a stored gradient.
        """
        if grad_self is None:
            return None
        grads_dict = self.Knowledge_type_grad.get(param_name)
        if grads_dict is None or knowledge_id not in grads_dict:
            return None
        other_grads = [
            grad.to(self.compute_device).view(-1)
            for k, grad in grads_dict.items()
            if k != knowledge_id
        ]
        if not other_grads:
            return None
        grad_self = grad_self.to(self.compute_device).view(1, -1)  # shape: (1, D)
        other_grads = torch.stack(other_grads, dim=0)  # shape: (N-1, D)
        cos_sims = F.cosine_similarity(grad_self, other_grads)  # shape: (N-1,)
        return cos_sims.abs().max().item()

class LLaVATrainer(Trainer):
    def __init__(self,num_virtual_tokens, save_dir,*args, **kwargs):
        """Set up the trainer and its unlearning / memory-token state.

        Args:
            num_virtual_tokens: Total number of prompt (memory) tokens; used by
                the training steps to split the prompt embedding into
                per-task slices.
            save_dir: Directory handed to the memory-token saving helper.
            *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.

        The training arguments are expected to carry ``knowledge_length`` and
        ``Simlarity_Threshold`` (spelling as defined by the arguments class);
        ``lora_enable``, ``prompt_tuning_enable`` and ``lora_bias`` are
        optional and default to ``True``, ``True`` and ``"none"``.
        """
        super().__init__(*args, **kwargs)
        self.memory_tokens = None
        self.unlearn_weight = 1.0
        self.kl_weight = 0.5
        # Per-parameter LoRA gradient norm above which the unlearning step clips.
        self.threshold = 0.05
        # Norm the clipped LoRA gradients are scaled down to.
        self.max_grad_norm = 0.05
        self.num_virtual_tokens = num_virtual_tokens
        self.task_length = None
        self.memory_tokens_init = None
        self.save_dir = save_dir
        print(f"save_dir: {self.save_dir}")

        # Read the flags from `self.args` rather than `kwargs['args']`:
        # `Trainer.__init__` always populates `self.args`, including when the
        # training arguments were passed positionally, in which case the
        # kwargs lookup would silently fall back to the defaults.
        train_args = self.args
        self.lora_enable = getattr(train_args, 'lora_enable', True)
        self.prompt_tuning_enable = getattr(train_args, 'prompt_tuning_enable', True)
        self.lora_bias = getattr(train_args, 'lora_bias', "none")
        print(f"lora_enable: {self.lora_enable}, prompt_tuning_enable: {self.prompt_tuning_enable}, lora_bias: {self.lora_bias}")

        self.state_dicts = {}
        self.knowledge_grad = KnowledgeGrad(
            knowledge_length=train_args.knowledge_length,
            Threshold=train_args.Simlarity_Threshold,
        )
    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Pick the sampler used for the training dataloader.

        Returns ``None`` when there is no training dataset or it has no
        length. With ``args.group_by_modality_length`` set, a
        ``LengthGroupedSampler`` built from the dataset's
        ``modality_lengths`` is returned; otherwise the parent
        implementation decides.
        """
        dataset = self.train_dataset
        if dataset is None or not has_length(dataset):
            return None

        if not self.args.group_by_modality_length:
            return super()._get_train_sampler()

        modality_lengths = dataset.modality_lengths
        return LengthGroupedSampler(
            self.args.train_batch_size,
            world_size=self.args.world_size * self.args.gradient_accumulation_steps,
            lengths=modality_lengths,
            group_by_modality=True,
        )
    
    def training_step(self, model, inputs):
        """Run one forward/backward pass and post-process gradients per knowledge id.

        After the backward pass the gradients are rewritten through the
        DeepSpeed full-gradient accessors:

        * parameters whose name contains ``"prompt_encoder"``: only the rows
          ``[start, end)`` belonging to the current knowledge id keep their
          gradient, every other row is zeroed;
        * parameters whose name contains ``"lora"``: the gradient is replaced
          by whatever ``self.knowledge_grad.update_grad`` returns for it.

        Args:
            model (`nn.Module`): The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`): The batch. It must
                contain ``knowledge_id`` and ``task_length`` (the first entry
                of each is used); an optional ``mask`` entry is dropped before
                the forward pass.

        Return:
            `tuple`: ``(loss, outputs)`` where ``loss`` is the detached loss
            (divided by ``gradient_accumulation_steps`` on the regular path)
            and ``outputs`` is always ``None``.
        """
        inputs = self._prepare_inputs(inputs)
        task_ids = inputs['knowledge_id'][0]
        task_length = inputs['task_length'][0]
        print(f"task_length: {inputs['task_length']}, knowledge_id: {inputs['knowledge_id']}")
        task_mem_token_lenth = self.num_virtual_tokens / task_length
        if self.task_length is None:
            self.task_length = task_length

        # Row range of the prompt embedding owned by this knowledge id. It is
        # the same for every parameter, so compute it once.
        # NOTE(review): the slice looks like it assumes 1-based knowledge ids;
        # with id 0 it is empty and the whole prompt gradient is zeroed - confirm.
        start = int((task_ids - 1) * task_mem_token_lenth)
        end = int(task_ids * task_mem_token_lenth)

        # `mask` is not a model argument; tolerate batches that do not carry it
        # (same handling as training_step_unlearning).
        if "mask" in inputs:
            del inputs['mask']

        outputs = None
        model.train()
        if is_sagemaker_mp_enabled():
            # Only importable when SageMaker model parallelism is active.
            from transformers.trainer_pt_utils import smp_forward_backward

            loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
            # Keep the (loss, outputs) shape the training loop unpacks.
            return loss_mb.reduce_mean().detach().to(self.args.device), outputs

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training
        if self.use_apex:
            # NOTE(review): `amp` is not imported in the visible part of this
            # file; the apex path presumably needs `from apex import amp` - confirm.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss)

        for name, param in model.named_parameters():
            is_prompt = "prompt_encoder" in name
            if not is_prompt and "lora" not in name:
                continue
            grad = deepspeed.utils.safe_get_full_grad(param)
            if grad is None:
                # No gradient available for this parameter: nothing to rewrite.
                continue
            if is_prompt:
                keep_rows = torch.zeros_like(grad)
                keep_rows[start:end, :] = 1
                deepspeed.utils.safe_set_full_grad(param, grad * keep_rows)
            else:
                grad = self.knowledge_grad.update_grad(name, int(task_ids), grad)
                deepspeed.utils.safe_set_full_grad(param, grad)

        return loss.detach() / self.args.gradient_accumulation_steps, outputs
    
    def training_step_unlearning(self, model, inputs, outputs):
        """Run one gradient-ascent ("unlearning") step.

        The forward pass is run with ``block_memory=True`` and without the
        ``knowledge_id`` / ``task_length`` entries; the loss is negated and
        clamped to ``[-2.5, 1.0]`` before the backward pass. Afterwards:

        * every ``"lora"`` gradient whose norm exceeds ``self.threshold`` is
          rescaled to norm ``self.max_grad_norm``;
        * every ``"prompt_encoder"`` gradient is set to zero.

        Args:
            model (`nn.Module`): The model to train.
            inputs: The batch; must contain ``knowledge_id`` and
                ``task_length``. An optional ``mask`` entry is dropped.
            outputs: Unused; kept for call compatibility.

        Return:
            `torch.Tensor`: The detached (negated, clamped) loss divided by
            ``gradient_accumulation_steps``.
        """
        self.phrase="Donald Trump"
        self.model.train()
        if "mask" in inputs:
            del inputs['mask']
        inputs = self._prepare_inputs(inputs)

        if is_sagemaker_mp_enabled():
            # Only importable when SageMaker model parallelism is active.
            from transformers.trainer_pt_utils import smp_forward_backward

            loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
            return loss_mb.reduce_mean().detach()

        inputs["block_memory"] = True
        del inputs['knowledge_id']
        del inputs['task_length']
        with self.compute_loss_context_manager():
            loss, _ = self.compute_loss(model, inputs,return_outputs=True)
        print(f"second stage loss without prune: {loss}")

        # Gradient ascent on the original loss, bounded so one batch cannot
        # dominate the update.
        loss = torch.clamp(-loss, min=-2.5, max=1.0)

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if self.use_apex:
            # NOTE(review): `amp` is not imported in the visible part of this
            # file; the apex path presumably needs `from apex import amp` - confirm.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss)
        print(f"second stage loss: {loss}")

        for name, param in model.named_parameters():
            if "lora" in name:
                grad = deepspeed.utils.safe_get_full_grad(param)
                if grad is not None:
                    grad_norm = grad.norm()
                    if grad_norm > self.threshold:
                        # Clip the gradient we actually fetched and write it
                        # back through the same accessor; clipping
                        # `param.grad` directly has no effect when DeepSpeed
                        # keeps the gradient elsewhere. Same formula as
                        # torch.nn.utils.clip_grad_norm_.
                        clip_coef = torch.clamp(self.max_grad_norm / (grad_norm + 1e-6), max=1.0)
                        deepspeed.utils.safe_set_full_grad(param, grad * clip_coef)
            if "prompt_encoder" in name:
                # Memory tokens must not move during unlearning. Use this
                # parameter's own gradient for shape/device instead of a
                # variable left over from the LoRA branch.
                prompt_grad = deepspeed.utils.safe_get_full_grad(param)
                if prompt_grad is not None:
                    deepspeed.utils.safe_set_full_grad(param, torch.zeros_like(prompt_grad))

        return loss.detach() / self.args.gradient_accumulation_steps
    def training_step_withmem_unlearning(self, model, inputs, outputs,lr_with_mem):
        """Run one gradient-ascent step that may keep the memory tokens active.

        The forward pass uses ``block_memory=True``. When ``lr_with_mem`` is
        negative, ``knowledge_id`` and ``task_length`` are removed from the
        batch before the forward pass and every ``"lora"`` gradient is scaled
        by 1.1 afterwards. ``"prompt_encoder"`` gradients are only printed.

        Args:
            model (`nn.Module`): The model to train.
            inputs: The batch; must contain ``task_length``. An optional
                ``mask`` entry is dropped.
            outputs: Unused; kept for call compatibility.
            lr_with_mem: Sign switch described above.

        Return:
            `torch.Tensor`: The detached (negated, clamped) loss divided by
            ``gradient_accumulation_steps``.
        """
        self.model.train()
        task_length = inputs['task_length'][0]
        if self.task_length is None:
            self.task_length = task_length
        # `mask` is not a model argument; tolerate batches that do not carry it
        # (same handling as training_step_unlearning).
        if "mask" in inputs:
            del inputs['mask']
        inputs = self._prepare_inputs(inputs)
        if is_sagemaker_mp_enabled():
            # Only importable when SageMaker model parallelism is active.
            from transformers.trainer_pt_utils import smp_forward_backward

            loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
            return loss_mb.reduce_mean().detach()
        inputs["block_memory"] = True
        if lr_with_mem <0:
            print("lr_with_mem is less than 0, skip memory tokens")
            del inputs['knowledge_id']
            del inputs['task_length']
        with self.compute_loss_context_manager():
            loss, _ = self.compute_loss(model, inputs,return_outputs=True)
        print(f"third stage loss without prune: {loss}")
        # Gradient ascent on the original loss, bounded to [-2.5, 1.0].
        loss = torch.clamp(-loss, min=-2.5, max=1.0)
        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if self.use_apex:
            # NOTE(review): `amp` is not imported in the visible part of this
            # file; the apex path presumably needs `from apex import amp` - confirm.
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss)
        print(f"third stage loss: {loss}")

        for name, param in model.named_parameters():
            if "prompt_encoder" in name:
                print("third_stage_grad",deepspeed.utils.safe_get_full_grad(param))
            elif "lora" in name and lr_with_mem <0:
                grad = deepspeed.utils.safe_get_full_grad(param)
                if grad is not None:
                    # Slightly amplify the LoRA update when the memory tokens
                    # were skipped for this step.
                    deepspeed.utils.safe_set_full_grad(param,grad*1.1)

        return loss.detach() / self.args.gradient_accumulation_steps
    def _inner_training_loop(
        self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None
    ):
        self.accelerator.free_memory()
        self._train_batch_size = batch_size
        if self.args.auto_find_batch_size:
            if self.state.train_batch_size != self._train_batch_size:
                from accelerate.utils import release_memory

                (self.model_wrapped,) = release_memory(self.model_wrapped)
                self.model_wrapped = self.model

                # Check for DeepSpeed *after* the intial pass and modify the config
                if self.is_deepspeed_enabled:
                    # Temporarily unset `self.args.train_batch_size`
                    original_bs = self.args.per_device_train_batch_size
                    self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu)
                    self.propagate_args_to_deepspeed(True)
                    self.args.per_device_train_batch_size = original_bs
            self.state.train_batch_size = self._train_batch_size
        logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")
        # Data loader and number of training steps
        train_dataloader = self.get_train_dataloader()

        # Setting up training control variables:
        # number of training epochs: num_train_epochs
        # number of training steps per epoch: num_update_steps_per_epoch
        # total number of training steps to execute: max_steps
        total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size

        len_dataloader = None
        num_train_tokens = None
        # import ipdb;ipdb.set_trace()
        if has_length(train_dataloader):
            len_dataloader = len(train_dataloader)
            num_update_steps_per_epoch = len_dataloader // args.gradient_accumulation_steps
            num_update_steps_per_epoch = max(num_update_steps_per_epoch, 1)
            num_examples = self.num_examples(train_dataloader)
            if args.max_steps > 0:
                max_steps = args.max_steps
                num_train_epochs = args.max_steps // num_update_steps_per_epoch + int(
                    args.max_steps % num_update_steps_per_epoch > 0
                )
                # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's
                # the best we can do.
                num_train_samples = args.max_steps * total_train_batch_size
                if args.include_tokens_per_second:
                    num_train_tokens = (
                        self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps
                    )
            else:
                max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)
                num_train_epochs = math.ceil(args.num_train_epochs)
                num_train_samples = self.num_examples(train_dataloader) * args.num_train_epochs
                if args.include_tokens_per_second:
                    num_train_tokens = self.num_tokens(train_dataloader) * args.num_train_epochs
        elif args.max_steps > 0:  # Rely on max_steps when dataloader does not have a working size
            max_steps = args.max_steps
            # Setting a very large number of epochs so we go as many times as necessary over the iterator.
            num_train_epochs = sys.maxsize
            num_update_steps_per_epoch = max_steps
            num_examples = total_train_batch_size * args.max_steps
            num_train_samples = args.max_steps * total_train_batch_size
            if args.include_tokens_per_second:
                num_train_tokens = self.num_tokens(train_dataloader, args.max_steps) * args.gradient_accumulation_steps
        else:
            raise ValueError(
                "args.max_steps must be set to a positive value if dataloader does not have a length, was"
                f" {args.max_steps}"
            )

        if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug:
            if self.args.n_gpu > 1:
                # nn.DataParallel(model) replicates the model, creating new variables and module
                # references registered here no longer work on other gpus, breaking the module
                raise ValueError(
                    "Currently --debug underflow_overflow is not supported under DP. Please use DDP"
                    " (torchrun or torch.distributed.launch (deprecated))."
                )
            else:
                debug_overflow = DebugUnderflowOverflow(self.model)  # noqa

        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
        
        # We need to reset the scheduler, as its parameters may be different on subsequent calls
        if self._created_lr_scheduler:
            self.lr_scheduler = None
            self._created_lr_scheduler = False

        if self.is_deepspeed_enabled:
            self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps)

        if not delay_optimizer_creation:
            self.create_optimizer_and_scheduler(num_training_steps=max_steps)

        self.state = TrainerState()
        self.state.is_hyper_param_search = trial is not None
        self.state.train_batch_size = self._train_batch_size

        # Compute absolute values for logging, eval, and save if given as ratio
        if args.logging_steps is not None:
            if args.logging_steps < 1:
                self.state.logging_steps = math.ceil(max_steps * args.logging_steps)
            else:
                self.state.logging_steps = args.logging_steps
        if args.eval_steps is not None:
            if args.eval_steps < 1:
                self.state.eval_steps = math.ceil(max_steps * args.eval_steps)
            else:
                self.state.eval_steps = args.eval_steps
        if args.save_steps is not None:
            if args.save_steps < 1:
                self.state.save_steps = math.ceil(max_steps * args.save_steps)
            else:
                self.state.save_steps = args.save_steps

        # Activate gradient checkpointing if needed
        if args.gradient_checkpointing:
            if args.gradient_checkpointing_kwargs is None:
                gradient_checkpointing_kwargs = {}
            else:
                gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs

            self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)

        model = self._wrap_model(self.model_wrapped)
        
        # as the model is wrapped, don't use `accelerator.prepare`
        # this is for unhandled cases such as
        # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX
        use_accelerator_prepare = True if model is self.model else False

        if delay_optimizer_creation:
            self.create_optimizer_and_scheduler(num_training_steps=max_steps)
        # prepare using `accelerator` prepare
        if use_accelerator_prepare:
            self.model.train()
            if hasattr(self.lr_scheduler, "step"):
                if self.use_apex:
                    model = self.accelerator.prepare(self.model)
                else:
                    model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
            else:
                # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
                model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
                    self.model, self.optimizer, self.lr_scheduler
                )
        # ...existing code...

        # if use_accelerator_prepare:
        #     # 保存 prompt_encoder 的参数
        #     prompt_encoder_state = None
        #     for name, module in self.model.named_modules():
        #         if "prompt_encoder" in name:
        #             prompt_encoder_state = {k: v.clone().detach() for k, v in module.state_dict().items()}
        #             prompt_encoder_module_name = name
        #             break
        #     self.model.train()
        #     if hasattr(self.lr_scheduler, "step"):
        #         if self.use_apex:
        #             model = self.accelerator.prepare(self.model)
        #         else:
        #             model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
        #             # for name, param in model.named_parameters():
        #             #     if "prompt_encoder" in name:
        #             #         print(f"if self.is_fsdp_enabled:):, {name}, {param}, param.grad{param.grad}")
        #             #         print(fuck)
        #     else:
        #         # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config.
        #         model, self.optimizer, self.lr_scheduler = self.accelerator.prepare(
        #             self.model, self.optimizer, self.lr_scheduler
        #         )

            # 将 prompt_encoder 的参数赋值回 model
            # if prompt_encoder_state is not None:
            #     # 获取 model 中的 prompt_encoder 模块
            #     module = model
            #     for attr in prompt_encoder_module_name.split('.'):
            #         module = getattr(module, attr)
            #     module.load_state_dict(prompt_encoder_state)
        
        
        if self.is_fsdp_enabled:
            self.model = self.model_wrapped = model

        # for the rest of this function `model` is the outside model, whether it was wrapped or not
        if model is not self.model:
            self.model_wrapped = model

        # backward compatibility
        if self.is_deepspeed_enabled:
            self.deepspeed = self.model_wrapped

        # ckpt loading
        if resume_from_checkpoint is not None:
            if self.is_deepspeed_enabled:
                deepspeed_load_checkpoint(self.model_wrapped, resume_from_checkpoint)
            elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled:
                self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped)

        # Check if saved optimizer or scheduler states exist
        self._load_optimizer_and_scheduler(resume_from_checkpoint)

        # important: at this point:
        # self.model         is the Transformers Model
        # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model),
        # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc.

        # Train!
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {num_examples:,}")
        logger.info(f"  Num Epochs = {num_train_epochs:,}")
        logger.info(f"  Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
        if self.args.per_device_train_batch_size != self._train_batch_size:
            logger.info(f"  Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}")
        logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}")
        logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
        logger.info(f"  Total optimization steps = {max_steps:,}")
        logger.info(f"  Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}")

        self.state.epoch = 0
        start_time = time.time()
        epochs_trained = 0
        steps_trained_in_current_epoch = 0
        steps_trained_progress_bar = None

        # Check if continuing training from a checkpoint
        if resume_from_checkpoint is not None and os.path.isfile(
            os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)
        ):
            self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
            epochs_trained = self.state.global_step // num_update_steps_per_epoch
            if not args.ignore_data_skip:
                steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch)
                steps_trained_in_current_epoch *= args.gradient_accumulation_steps
            else:
                steps_trained_in_current_epoch = 0

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info(f"  Continuing training from epoch {epochs_trained}")
            logger.info(f"  Continuing training from global step {self.state.global_step}")
            if not args.ignore_data_skip:
                logger.info(
                    f"  Will skip the first {epochs_trained} epochs then the first"
                    f" {steps_trained_in_current_epoch} batches in the first epoch."
                )

        # Update the references
        self.callback_handler.model = self.model
        self.callback_handler.optimizer = self.optimizer
        self.callback_handler.lr_scheduler = self.lr_scheduler
        self.callback_handler.train_dataloader = train_dataloader
        if self.hp_name is not None and self._trial is not None:
            # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial
            # parameter to Train when using DDP.
            self.state.trial_name = self.hp_name(self._trial)
        if trial is not None:
            assignments = trial.assignments if self.hp_search_backend == HPSearchBackend.SIGOPT else trial
            self.state.trial_params = hp_params(assignments)
        else:
            self.state.trial_params = None
        # This should be the same if the state has been saved but in case the training arguments changed, it's safer
        # to set this after the load.
        self.state.max_steps = max_steps
        self.state.num_train_epochs = num_train_epochs
        self.state.is_local_process_zero = self.is_local_process_zero()
        self.state.is_world_process_zero = self.is_world_process_zero()

        # tr_loss is a tensor to avoid synchronization of TPUs through .item()
        tr_loss = torch.tensor(0.0).to(args.device)
        # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses
        self._total_loss_scalar = 0.0
        self._globalstep_last_logged = self.state.global_step
        model.zero_grad()

        self.control = self.callback_handler.on_train_begin(args, self.state, self.control)

        # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point.
        if not args.ignore_data_skip:
            for epoch in range(epochs_trained):
                sampler = get_dataloader_sampler(train_dataloader)
                sampler_kinds = [RandomSampler]
                if version.parse(accelerate_version) > version.parse("0.23.0"):
                    sampler_kinds.append(SeedableRandomSampler)
                is_random_sampler = isinstance(sampler, tuple(sampler_kinds))
                if not is_random_sampler:
                    # We just need to begin an iteration to create the randomization of the sampler.
                    for _ in train_dataloader:
                        break
                else:
                    # Otherwise we need to call the whooooole sampler cause there is some random operation added
                    # AT THE VERY END!
                    sampler = sampler if sampler is not None else []
                    _ = list(sampler)
        
        total_batched_samples = 0
        if self.prompt_tuning_enable:
            import copy
            
            print("start to save memory tokens")
            self.memory_tokens_init = copy.deepcopy(model.get_prompt_embedding_to_save(adapter_name="default"))
            print(f"memory tokens shape is {self.memory_tokens_init.shape}")
            print(f"memory tokens  is {self.memory_tokens_init}")
            for i in range(8):
                print()
            print("finish getting memory tokens")
            save_split_prompt_embeddings_init(self.memory_tokens_init, 2, self.save_dir)
        for epoch in range(epochs_trained, num_train_epochs):
            epoch_iterator = train_dataloader
            if hasattr(epoch_iterator, "set_epoch"):
                epoch_iterator.set_epoch(epoch)
            
            # Reset the past mems state at the beginning of each epoch if necessary.
            if args.past_index >= 0:
                self._past = None

            steps_in_epoch = (
                len(epoch_iterator)
                if len_dataloader is not None
                else args.max_steps * args.gradient_accumulation_steps
            )
            self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)

            if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0:
                self._load_rng_state(resume_from_checkpoint)

            rng_to_sync = False
            steps_skipped = 0
            if steps_trained_in_current_epoch > 0:
                epoch_iterator = skip_first_batches(epoch_iterator, steps_trained_in_current_epoch)
                steps_skipped = steps_trained_in_current_epoch
                steps_trained_in_current_epoch = 0
                rng_to_sync = True

            step = -1
            print("epoch_iterator",epoch_iterator)
            lr_with_mem = 1
            baseline = 1
            for step, inputs in enumerate(epoch_iterator):
                total_batched_samples += 1
    
                if self.args.include_num_input_tokens_seen:
                    main_input_name = getattr(self.model, "main_input_name", "input_ids")
                    if main_input_name not in inputs:
                        logger.warning(
                            "Tried to track the number of tokens seen, however the current model is "
                            "not configured properly to know what item is the input. To fix this, add "
                            "a `main_input_name` attribute to the model class you are using."
                        )
                    else:
                        self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()
                if rng_to_sync:
                    self._load_rng_state(resume_from_checkpoint)
                    rng_to_sync = False

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    if steps_trained_progress_bar is not None:
                        steps_trained_progress_bar.update(1)
                    if steps_trained_in_current_epoch == 0:
                        self._load_rng_state(resume_from_checkpoint)
                    continue
                elif steps_trained_progress_bar is not None:
                    steps_trained_progress_bar.close()
                    steps_trained_progress_bar = None

                if step % args.gradient_accumulation_steps == 0:
                    self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
                outputs = None
                # if int(inputs['knowledge_id'][0]) == 1:
                # if random.random() < 0.5: 
                baseline = - baseline
                with self.accelerator.accumulate(model):
                    # for name, param in model.named_parameters():
                    #     if "prompt_encoder" in name or "lora" in name:
                    #         print(f" {name}, {param}, param.grad{param.grad}")
                    # print("fuck")
                    if baseline > 0: 
                        tr_loss_step,outputs = self.training_step(model, inputs)
                    else:
                        tr_loss_step = self.training_step_unlearning(model, inputs, outputs)
                if (
                    args.logging_nan_inf_filter
                    and not is_torch_tpu_available()
                    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
                ):
                    # if loss is nan or inf simply add the average of previous logged losses
                    tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
                else:
                    tr_loss += tr_loss_step

                self.current_flos += float(self.floating_point_ops(inputs))

                is_last_step_and_steps_less_than_grad_acc = (
                    steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch
                )

                if (
                    total_batched_samples % args.gradient_accumulation_steps == 0
                    or
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    is_last_step_and_steps_less_than_grad_acc
                ):
                    # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
                    # in accelerate. So, explicitly enable sync gradients to True in that case.
                    if is_last_step_and_steps_less_than_grad_acc:
                        self.accelerator.gradient_state._set_sync_gradients(True)

                    # Gradient clipping
                    if args.max_grad_norm is not None and args.max_grad_norm > 0:
                        # deepspeed does its own clipping

                        if is_sagemaker_mp_enabled() and args.fp16:
                            self.optimizer.clip_master_grads(args.max_grad_norm)
                        elif self.use_apex:
                            # Revert to normal clipping otherwise, handling Apex or full precision
                            nn.utils.clip_grad_norm_(
                                amp.master_params(self.optimizer),
                                args.max_grad_norm,
                            )
                        else:
                            self.accelerator.clip_grad_norm_(
                                model.parameters(),
                                args.max_grad_norm,
                            )
                    print(f"first_stage loss is {tr_loss}")
                    # Optimizer step
                    self.optimizer.step()
                    optimizer_was_run = not self.accelerator.optimizer_step_was_skipped
                    # ##
                ###################################
                # else:
                #     with self.accelerator.accumulate(model):
                #         tr_loss_step = self.training_step_withmem_unlearning(model, inputs,outputs,lr_with_mem)

                #     if (
                #         args.logging_nan_inf_filter
                #         and not is_torch_tpu_available()
                #         and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
                #     ):
                #         # if loss is nan or inf simply add the average of previous logged losses
                #         tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
                #     else:
                #         tr_loss += tr_loss_step
                    
                #     self.current_flos += float(self.floating_point_ops(inputs))

                #     is_last_step_and_steps_less_than_grad_acc = (
                #         steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch
                #     )

                #     if (
                #         total_batched_samples % args.gradient_accumulation_steps == 0
                #         or
                #         # last step in epoch but step is always smaller than gradient_accumulation_steps
                #         is_last_step_and_steps_less_than_grad_acc
                #     ):
                #         # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
                #         # in accelerate. So, explicitly enable sync gradients to True in that case.
                #         if is_last_step_and_steps_less_than_grad_acc:
                #             self.accelerator.gradient_state._set_sync_gradients(True)

                #         # Gradient clipping
                #         if args.max_grad_norm is not None and args.max_grad_norm > 0:
                #             # deepspeed does its own clipping

                #             if is_sagemaker_mp_enabled() and args.fp16:
                #                 self.optimizer.clip_master_grads(args.max_grad_norm)
                #             elif self.use_apex:
                #                 # Revert to normal clipping otherwise, handling Apex or full precision
                #                 nn.utils.clip_grad_norm_(
                #                     amp.master_params(self.optimizer),
                #                     args.max_grad_norm,
                #                 )
                #             else:
                #                 self.accelerator.clip_grad_norm_(
                #                     model.parameters(),
                #                     args.max_grad_norm,
                #                 )

                #         # Optimizer step
                #         self.optimizer.step()
                #         optimizer_was_run = not self.accelerator.optimizer_step_was_skipped
                    if optimizer_was_run:
                        # Delay optimizer scheduling until metrics are generated
                        if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                            self.lr_scheduler.step()

                    model.zero_grad()
                    self.state.global_step += 1
                    self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch
                    self.control = self.callback_handler.on_step_end(args, self.state, self.control)

                    self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
                else:
                    self.control = self.callback_handler.on_substep_end(args, self.state, self.control)
                lr_with_mem = -lr_with_mem
                if self.control.should_epoch_stop or self.control.should_training_stop:
                    break
                
            if step < 0:
                logger.warning(
                    "There seems to be not a single sample in your epoch_iterator, stopping training at step"
                    f" {self.state.global_step}! This is expected if you're using an IterableDataset and set"
                    f" num_steps ({max_steps}) higher than the number of available samples."
                )
                self.control.should_training_stop = True

            self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
            self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)

            if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
                if is_torch_tpu_available():
                    # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
                    xm.master_print(met.metrics_report())
                else:
                    logger.warning(
                        "You enabled PyTorch/XLA debug metrics but you don't have a TPU "
                        "configured. Check your training configuration if this is unexpected."
                    )
            if self.control.should_training_stop:
                break
        ###
        if self.prompt_tuning_enable:
            print("start to save memory tokens")
            self.memory_tokens = model.get_prompt_embedding_to_save(adapter_name="default")
            print(f"memory tokens shape is {self.memory_tokens.shape}")
            print("finish getting memory tokens")
        print("start to save parmaeters")
        from llava.train.save_parm import get_peft_state_maybe_zero_3, get_prompt_tuning_state_maybe_zero_3, get_peft_state_non_lora_prompt_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3, get_peft_state_non_prompt_maybe_zero_3
        if self.lora_enable:
            self.state_dicts['lora'] = get_peft_state_maybe_zero_3(
                model.named_parameters(), self.lora_bias
            )
            
        if self.prompt_tuning_enable:
            self.state_dicts['prompt'] = get_prompt_tuning_state_maybe_zero_3(
                model.named_parameters()
            )
            
        if self.prompt_tuning_enable and self.lora_enable:
            self.state_dicts['non_lora_prompt'] = get_peft_state_non_lora_prompt_maybe_zero_3(
                model.named_parameters()
            )

        if self.lora_enable:
            self.state_dicts['non_lora'] = get_peft_state_non_lora_maybe_zero_3(
                model.named_parameters()
            )

        if self.prompt_tuning_enable:
            self.state_dicts['non_prompt'] = get_peft_state_non_prompt_maybe_zero_3(
                model.named_parameters()
            )
        ###
        if args.past_index and hasattr(self, "_past"):
            # Clean the state at the end of training
            delattr(self, "_past")

        logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
        if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
            # Wait for everyone to get here so we are sure the model has been saved by process 0.
            if is_torch_tpu_available():
                xm.rendezvous("load_best_model_at_end")
            elif args.parallel_mode == ParallelMode.DISTRIBUTED:
                dist.barrier()
            elif is_sagemaker_mp_enabled():
                smp.barrier()

            self._load_best_model()

        # add remaining tr_loss
        self._total_loss_scalar += tr_loss.item()
        train_loss = self._total_loss_scalar / self.state.global_step

        metrics = speed_metrics(
            "train",
            start_time,
            num_samples=num_train_samples,
            num_steps=self.state.max_steps,
            num_tokens=num_train_tokens,
        )
        self.store_flos()
        metrics["total_flos"] = self.state.total_flos
        metrics["train_loss"] = train_loss

        self.is_in_train = False

        self._memory_tracker.stop_and_update_metrics(metrics)

        self.log(metrics)

        run_dir = self._get_output_dir(trial)
        checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)

        # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save.
        if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
            for checkpoint in checkpoints_sorted:
                if not os.path.samefile(checkpoint, self.state.best_model_checkpoint):
                    logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
                    shutil.rmtree(checkpoint)

        self.control = self.callback_handler.on_train_end(args, self.state, self.control)

        # Wait for the checkpoint to be uploaded.
        self._finish_current_push()

        # After training we make sure to retrieve back the original forward pass method
        # for the embedding layer by removing the forward post hook.
        if self.neftune_noise_alpha is not None:
            self._deactivate_neftune(self.model)

        return TrainOutput(self.state.global_step, train_loss, metrics)
    def create_optimizer(self):
        """
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
        Trainer's init through `optimizers`, or subclass and override this method in a subclass.

        Trainable parameters are split into a weight-decay group and a no-weight-decay group. When
        `args.mm_projector_lr` is set, parameters whose name contains "mm_projector" are put into two
        additional groups (decay / no decay) that carry that learning rate.

        Returns:
            The optimizer stored on `self.optimizer`; it is only built here when that attribute is `None`.
            When SageMaker model parallelism is enabled the parent implementation's result is returned instead.
        """
        if is_sagemaker_mp_enabled():
            return super().create_optimizer()

        opt_model = self.model

        if self.optimizer is None:
            # Keep the names in sets: every parameter is tested for membership below, and doing
            # that against lists makes the grouping quadratic in the number of parameters.
            decay_parameters = {
                name for name in get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS) if "bias" not in name
            }
            # Parameters that do not require grad are never handed to the optimizer.
            trainable = [(n, p) for n, p in opt_model.named_parameters() if p.requires_grad]

            if self.args.mm_projector_lr is not None:
                projector_parameters = {name for name, _ in opt_model.named_parameters() if "mm_projector" in name}
                optimizer_grouped_parameters = [
                    {
                        "params": [
                            p for n, p in trainable if n in decay_parameters and n not in projector_parameters
                        ],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [
                            p for n, p in trainable if n not in decay_parameters and n not in projector_parameters
                        ],
                        "weight_decay": 0.0,
                    },
                    {
                        "params": [
                            p for n, p in trainable if n in decay_parameters and n in projector_parameters
                        ],
                        "weight_decay": self.args.weight_decay,
                        "lr": self.args.mm_projector_lr,
                    },
                    {
                        "params": [
                            p for n, p in trainable if n not in decay_parameters and n in projector_parameters
                        ],
                        "weight_decay": 0.0,
                        "lr": self.args.mm_projector_lr,
                    },
                ]
            else:
                optimizer_grouped_parameters = [
                    {
                        "params": [p for n, p in trainable if n in decay_parameters],
                        "weight_decay": self.args.weight_decay,
                    },
                    {
                        "params": [p for n, p in trainable if n not in decay_parameters],
                        "weight_decay": 0.0,
                    },
                ]

            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)

            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                # Register a 32-bit optimizer-state override for the weight of every embedding module.
                skipped = 0
                for module in opt_model.modules():
                    if isinstance(module, nn.Embedding):
                        # Keying by data_ptr counts parameters that share storage only once.
                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                        logger.info(f"skipped {module}: {skipped/2**20}M params")
                        manager.register_module_override(module, "weight", {"optim_bits": 32})
                        logger.debug(f"bitsandbytes: will optimize {module} in fp32")
                logger.info(f"skipped: {skipped/2**20}M params")

        return self.optimizer

    def _save_checkpoint(self, model, trial, metrics=None):
        """
        Write a checkpoint for the current global step.

        If `args.tune_mm_mlp_adapter` is truthy, only the parameters selected by
        `get_mm_adapter_state_maybe_zero_3` for the adapter key list are saved, as `mm_projector.bin`
        next to the model config inside `<run_dir>/<PREFIX_CHECKPOINT_DIR>-<global_step>`.
        Otherwise the parent class's `_save_checkpoint` is called with the same arguments.
        """
        adapter_only = getattr(self.args, 'tune_mm_mlp_adapter', False)
        if not adapter_only:
            super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics)
            return

        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        step_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        output_dir = os.path.join(self._get_output_dir(trial=trial), step_folder)

        # Name fragments that select the adapter parameters to keep.
        keys_to_match = ['mm_projector', 'vision_resampler']
        if getattr(self.args, "use_im_start_end", False):
            keys_to_match += ['embed_tokens', 'embed_in']

        # NOTE(review): the helper is defined outside this block; presumably it gathers
        # ZeRO-3 partitioned parameters before returning them -- confirm.
        adapter_weights = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)

        # Only the process with local_rank 0 or -1 writes files.
        if self.args.local_rank in (0, -1):
            self.model.config.save_pretrained(output_dir)
            torch.save(adapter_weights, os.path.join(output_dir, 'mm_projector.bin'))

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        if getattr(self.args, 'tune_mm_mlp_adapter', False):
            pass
        else:
            super(LLaVATrainer, self)._save(output_dir, state_dict)
    
    def get_train_dataloader(self) -> DataLoader:
        """
        Build the training dataloader and hand it to `self.accelerator.prepare`.

        A `datasets.Dataset` is passed through `_remove_unused_columns`; for any other dataset the
        collator is wrapped with `_get_collator_with_removed_columns` instead. The sampler,
        `drop_last` and the `seed_worker` init function are only supplied when the dataset is not
        a `torch.utils.data.IterableDataset`.

        Raises:
            ValueError: if `self.train_dataset` is `None`.
        """
        dataset = self.train_dataset
        if dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        collator = self.data_collator
        if is_datasets_available() and isinstance(dataset, datasets.Dataset):
            dataset = self._remove_unused_columns(dataset, description="training")
        else:
            collator = self._get_collator_with_removed_columns(collator, description="training")

        args = self.args
        loader_kwargs = {
            "batch_size": self._train_batch_size,
            "collate_fn": collator,
            "num_workers": args.dataloader_num_workers,
            "pin_memory": args.dataloader_pin_memory,
            "persistent_workers": args.dataloader_persistent_workers,
        }

        if not isinstance(dataset, torch.utils.data.IterableDataset):
            loader_kwargs.update(
                {
                    "sampler": self._get_train_sampler(),
                    "drop_last": args.dataloader_drop_last,
                    "worker_init_fn": seed_worker,
                }
            )

        # KnowledgeIdDataLoader is used in place of the stock DataLoader.
        # NOTE(review): it is defined outside this block; confirm what it adds to each batch.
        loader = KnowledgeIdDataLoader(dataset, **loader_kwargs)
        return self.accelerator.prepare(loader)