import numpy as np 
import torch
import torch.nn as nn

import copy
from torchvision import transforms
from PIL import Image
import torch.nn.functional as F

import time
import random

class SGAttacker():
    """Alternate a text attack and an image attack against one model.

    The model must provide ``inference_image`` and ``inference_text``; their
    results are read through the ``'image_feat'`` and ``'text_feat'`` keys.
    """

    def __init__(self, model, img_attacker, txt_attacker):
        self.model = model
        self.img_attacker = img_attacker
        self.txt_attacker = txt_attacker

    def _paired_image_feats(self, imgs, txt2img):
        """Return one image feature per text, selected with ``txt2img``."""
        # The image attacker treats ``normalization=None`` as "feed raw pixels",
        # so the same convention is honoured here instead of calling None.
        normalization = self.img_attacker.normalization
        with torch.no_grad():
            if normalization is not None:
                imgs = normalization(imgs)
            return self.model.inference_image(imgs)['image_feat'][txt2img]

    def attack(self, imgs, txts, txt2img, device='cpu', max_length=30, scales=None, **kwargs):
        """Produce adversarial images and adversarial texts.

        Steps:
            1. Attack the texts, guided by the clean image features.
            2. Attack the images, guided by the features of those adversarial texts.
            3. Attack the *original* texts again, guided by the adversarial
               image features; this second result is the one returned.

        Args:
            imgs: Image batch handed to the image attacker and the model.
            txts: List of caption strings.
            txt2img: For every text, the index of its paired image in ``imgs``.
            device: Device the tokenized texts are moved to; also forwarded to
                the image attacker.
            max_length: Padding/truncation length used when tokenizing.
            scales: Forwarded unchanged to ``img_attacker.txt_guided_attack``.
            **kwargs: Accepted and ignored.

        Returns:
            Tuple ``(adv_imgs, adv_txts)``.
        """
        # Step 1: text attack against the clean images.
        img_supervisions = self._paired_image_feats(imgs, txt2img)
        adv_txts = self.txt_attacker.img_guided_attack(self.model, txts, img_embeds=img_supervisions)

        # Step 2: image attack against the adversarial texts.
        with torch.no_grad():
            txts_input = self.txt_attacker.tokenizer(adv_txts, padding='max_length', truncation=True, max_length=max_length, return_tensors="pt").to(device)
            txt_supervisions = self.model.inference_text(txts_input)['text_feat']
        adv_imgs = self.img_attacker.txt_guided_attack(self.model, imgs, txt2img, device,
                                                       scales=scales, txt_embeds=txt_supervisions)

        # Step 3: redo the text attack, now against the adversarial images.
        img_supervisions = self._paired_image_feats(adv_imgs, txt2img)
        adv_txts = self.txt_attacker.img_guided_attack(self.model, txts, img_embeds=img_supervisions)

        return adv_imgs, adv_txts

                

class ImageAttacker():
    def __init__(self, normalization, eps=2/255, steps=10, step_size=0.5/255):
        """Store the image-attack hyper-parameters.

        Args:
            normalization: Callable applied to images before they are fed to the
                model; parts of this class also accept ``None``.
            eps: Maximum per-pixel deviation from the clean image.
            steps: Number of attack steps the step counter has to reach.
            step_size: Magnitude of each signed-gradient step.
        """
        # Iteration cap used by ``Sinkhorn``.
        self.max_iter = 100
        self.step_size = step_size
        self.steps = steps
        self.eps = eps
        self.normalization = normalization

    def loss_func(self, adv_imgs_embeds, txts_embeds, txt2img):  
        device = adv_imgs_embeds.device    

        it_sim_matrix = adv_imgs_embeds @ txts_embeds.T
        it_labels = torch.zeros(it_sim_matrix.shape).to(device)
        
        for i in range(len(txt2img)):
            it_labels[txt2img[i], i]=1 
        
        loss_temp = it_sim_matrix * it_labels
        # # print(loss_temp)
        # for i in range(loss_temp.size(0)):
        #     start_idx = i * 5
        #     end_idx = start_idx + 5
        #     if end_idx > loss_temp.size(1):
        #         end_idx = loss_temp.size(1)
        #     subsequence = loss_temp[i, start_idx:end_idx]
        #     if len(subsequence) < 2:
        #         continue
        #     min_vals, min_idxs = torch.topk(subsequence, 2, largest=False)
        #     loss_temp[i, start_idx + min_idxs[0]] = 0
        #     loss_temp[i, start_idx + min_idxs[1]] = 0
        # # print(loss_temp)
            
        loss_IaTcpos = -(loss_temp).sum(-1).mean()
        loss = loss_IaTcpos
        
        return loss
    
   
    def txt_guided_attack(self, model, imgs, txt2img, device, scales=None, txt_embeds=None):
        
        model.eval()
       
        b, _, _, _ = imgs.shape
        
        if scales is None:
            scales_num = 1
        else:
            scales_num = len(scales) +1
            
        # print(scales)

        adv_imgs = imgs.detach() + torch.from_numpy(np.random.uniform(-self.eps, self.eps, imgs.shape)).float().to(device)
        adv_imgs = torch.clamp(adv_imgs, 0.0, 1.0)
        
        prev_adv_imgs_embeds = None  # 初始化
        orig_imgs_embeds = model.inference_image(self.normalization(adv_imgs))['image_feat']
        
        # for i in range(self.steps):
        i = 0
        is_update = 0
        round_counter = 0
        while(i<self.steps):
            adv_imgs.requires_grad_()
            scales = self.get_random_scales(i)
            scales_num = len(scales) + 1
            # print(scales, scales_num)
            scaled_imgs = self.get_scaled_imgs(adv_imgs, scales, device)   
            # print(adv_imgs.shape, scaled_imgs.shape)    # torch.Size([8, 3, 224, 224]) torch.Size([40, 3, 224, 224]) 
        
            if self.normalization is not None:
                adv_imgs_output = model.inference_image(self.normalization(scaled_imgs))
            else:
                adv_imgs_output = model.inference_image(scaled_imgs)
                
            adv_imgs_embeds = adv_imgs_output['image_feat']
            model.zero_grad()
            with torch.enable_grad():
                loss_list = []
                loss = torch.tensor(0.0, dtype=torch.float32).to(device)
                for g in range(scales_num): # 5
                    loss_item = self.loss_func(adv_imgs_embeds[g*b:g*b+b], txt_embeds, txt2img)
                    # loss_item = self.optimal_transport_loss(adv_imgs_embeds[i*b:i*b+b], txt_embeds, txt2img)
                    loss_list.append(loss_item.item())
                    loss += loss_item
            loss.backward()
            
            # grad = adv_imgs.grad 
            # grad = grad / torch.mean(torch.abs(grad), dim=(1,2,3), keepdim=True)           
            
            # perturbation = self.step_size * grad.sign()
            # adv_imgs = adv_imgs.detach() + perturbation
            # adv_imgs = torch.min(torch.max(adv_imgs, imgs - self.eps), imgs + self.eps)
            # adv_imgs = torch.clamp(adv_imgs, 0.0, 1.0)
            
            grad = adv_imgs.grad 
            grad = grad / torch.mean(torch.abs(grad), dim=(1, 2, 3), keepdim=True)           
            
            perturbation = self.step_size * grad.sign()
            new_adv_imgs = adv_imgs.detach() + perturbation
            new_adv_imgs = torch.min(torch.max(new_adv_imgs, imgs - self.eps), imgs + self.eps)
            new_adv_imgs = torch.clamp(new_adv_imgs, 0.0, 1.0)

            if self.normalization is not None:
                new_adv_imgs_output = model.inference_image(self.normalization(new_adv_imgs))
            else:
                new_adv_imgs_output = model.inference_image(new_adv_imgs)
            new_adv_imgs_embeds = new_adv_imgs_output['image_feat']
            
            if prev_adv_imgs_embeds is None:
                prev_adv_imgs_embeds = model.inference_image(self.normalization(adv_imgs))['image_feat']

            diff_new = torch.norm(new_adv_imgs_embeds - orig_imgs_embeds, p=2)
            diff_current = torch.norm(prev_adv_imgs_embeds - orig_imgs_embeds, p=2)
            
            if (i%4==0 and i>0):
                diff_temp = torch.norm(new_adv_imgs_embeds - prev_adv_imgs_embeds, p=2)
                if is_update == 0:
                    diff_temp2 = torch.norm(prev_adv_imgs_embeds - orig_imgs_embeds, p=2)
                    is_update = 1
                if diff_temp < diff_temp2:  # 差异不够，回退
                    i = i-3
                    
                else:                       # 差异足够，前进
                    prev_adv_imgs_embeds = new_adv_imgs_embeds.detach().clone()     # 标记现在的特征
                    is_update = 0                                                   # 标记下一次计算

            # if diff_new > diff_current:
            #     adv_imgs = new_adv_imgs
            #     prev_adv_imgs_embeds = new_adv_imgs_embeds.detach().clone()
            #     i = i + 1
            adv_imgs = new_adv_imgs
            i = i + 1
            if i<self.steps:
                round_counter = round_counter+1
            else:
                print(round_counter)
                round_counter = 0
            # print(i)
            
        return adv_imgs

    def get_random_scales(self, n):
        possible_scales = [i/100.0 for i in range(25, 301)]
        scales = random.sample(possible_scales, n)
        return scales

    def get_scaled_imgs(self, imgs, scales=None, device='cuda'):
        if scales is None:
            return imgs

        ori_shape = (imgs.shape[-2], imgs.shape[-1])
        
        reverse_transform = transforms.Resize(ori_shape,
                                interpolation=transforms.InterpolationMode.BICUBIC)
        result = []
        for ratio in scales:
            scale_shape = (int(ratio*ori_shape[0]), 
                                  int(ratio*ori_shape[1]))
            scale_transform = transforms.Resize(scale_shape,
                                  interpolation=transforms.InterpolationMode.BICUBIC)
            scaled_imgs = imgs + torch.from_numpy(np.random.normal(0.0, 0.05, imgs.shape)).float().to(device)
            scaled_imgs = scale_transform(scaled_imgs)
            scaled_imgs = torch.clamp(scaled_imgs, 0.0, 1.0)
            
            reversed_imgs = reverse_transform(scaled_imgs)
            
            result.append(reversed_imgs)
        
        return torch.cat([imgs,]+result, 0)
    
    def optimal_transport_loss(self, img_embeds, txt_embeds, txt2img):
        device = img_embeds.device

        it_sim_matrix = img_embeds @ txt_embeds.T
        it_labels = torch.zeros(it_sim_matrix.shape).to(device)

        for i in range(len(txt2img)):
            it_labels[txt2img[i], i] = 1

        # cost_matrix = -it_sim_matrix
        cost_matrix = -it_sim_matrix * it_labels
        # cost_matrix = torch.exp(cost_matrix)

        N = img_embeds.shape[0]  # 图像嵌入的数量
        M = txt_embeds.shape[0]  # 文本嵌入的数量

        u = torch.ones(N, device=device) / N  # 源分布的边际概率向量
        v = torch.ones(M, device=device) / M  # 目标分布的边际概率向量

        transport_plan = self.Sinkhorn(cost_matrix, u, v)
        
        scale_factor = 10

        loss = torch.sum(transport_plan * cost_matrix) * scale_factor
        return loss


    
    def Sinkhorn(self, K, u, v):
        r = torch.ones_like(u)
        c = torch.ones_like(v)
        thresh = 1e-2
        for i in range(self.max_iter):
            r0 = r
            r = u / torch.matmul(K, c.unsqueeze(-1)).squeeze(-1)
            c = v / torch.matmul(K.t(), r.unsqueeze(-1)).squeeze(-1)
            err = (r - r0).abs().mean()
            if err.item() < thresh:
                break

        T = torch.matmul(r.unsqueeze(-1), c.unsqueeze(-2)) * K

        return T




# Words and punctuation excluded from text perturbation: the text attacker
# skips a target word found here and also rejects any substitute found here.
# The list is converted to a set below so membership tests are constant-time
# (duplicate entries in the literal are therefore harmless).
filter_words = ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost',
                'alone', 'along', 'already', 'also', 'although', 'am', 'among', 'amongst', 'an', 'and', 'another',
                'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', "aren't", 'around', 'as',
                'at', 'back', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
                'between', 'beyond', 'both', 'but', 'by', 'can', 'cannot', 'could', 'couldn', "couldn't", 'd', 'didn',
                "didn't", 'doesn', "doesn't", 'don', "don't", 'down', 'due', 'during', 'either', 'else', 'elsewhere',
                'empty', 'enough', 'even', 'ever', 'everyone', 'everything', 'everywhere', 'except', 'first', 'for',
                'former', 'formerly', 'from', 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'he', 'hence',
                'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
                'how', 'however', 'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is', 'isn', "isn't", 'it', "it's",
                'its', 'itself', 'just', 'latter', 'latterly', 'least', 'll', 'may', 'me', 'meanwhile', 'mightn',
                "mightn't", 'mine', 'more', 'moreover', 'most', 'mostly', 'must', 'mustn', "mustn't", 'my', 'myself',
                'namely', 'needn', "needn't", 'neither', 'never', 'nevertheless', 'next', 'no', 'nobody', 'none',
                'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'on', 'once', 'one', 'only',
                'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'per',
                'please', 's', 'same', 'shan', "shan't", 'she', "she's", "should've", 'shouldn', "shouldn't", 'somehow',
                'something', 'sometime', 'somewhere', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs',
                'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
                'thereupon', 'these', 'they', 'this', 'those', 'through', 'throughout', 'thru', 'thus', 'to', 'too',
                'toward', 'towards', 'under', 'unless', 'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't",
                'we', 'were', 'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
                'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while',
                'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won',
                "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd", "you'll", "you're", "you've",
                'your', 'yours', 'yourself', 'yourselves', '.', '-', 'a the', '/', '?', 'some', '"', ',', 'b', '&', '!',
                '@', '%', '^', '*', '(', ')', "-", '-', '+', '=', '<', '>', '|', ':', ";", '～', '·']
filter_words = set(filter_words)
    

class TextAttacker():
    def __init__(self, ref_net, tokenizer, cls=True, max_length=30, number_perturbation=1, topk=10, threshold_pred_score=0.3, batch_size=32):
        """Store the text-attack configuration.

        Args:
            ref_net: Masked language model used to propose substitute tokens.
            tokenizer: Tokenizer shared by ``ref_net`` and the attacked model.
            cls: If true, embeddings are read at position 0 of the sequence
                dimension; otherwise they are flattened per sample.
            max_length: Padding/truncation length used when tokenizing.
            number_perturbation: Maximum number of words changed per text.
            topk: Number of masked-LM candidates kept per token position.
            threshold_pred_score: Score threshold passed to ``get_substitues``.
            batch_size: Batch size used when scoring word importance.
        """
        # Models and tokenizer.
        self.ref_net = ref_net
        self.tokenizer = tokenizer

        # How embeddings are read and how texts are tokenized.
        self.cls = cls
        self.max_length = max_length
        self.batch_size = batch_size

        # Substitution budget (epsilon_txt) and candidate selection.
        self.num_perturbation = number_perturbation
        self.topk = topk
        self.threshold_pred_score = threshold_pred_score

        self.max_iter = 100

    def img_guided_attack(self, net, texts, img_embeds = None):
        """Replace words in each text so its embedding drifts away from the paired image.

        For every text the words are ranked by ``get_important_scores``.
        Walking that ranking, each eligible word is swapped for the masked-LM
        candidate (proposed by ``self.ref_net``) that maximises ``loss_func``,
        i.e. minimises the dot product with ``img_embeds[i]``; keeping the
        original word is always one of the options.  At most
        ``self.num_perturbation`` words are changed per text.

        Args:
            net: Attacked model; ``inference_text(inputs)`` must return a
                mapping with a ``'text_feat'`` entry.
            texts: List of strings; words are split on single spaces.
            img_embeds: ``img_embeds[i]`` is the image embedding paired with
                ``texts[i]``.

        Returns:
            List of (possibly modified) strings, one per input text.
        """
        device = self.ref_net.device

        # Candidates are only ranked, nothing is differentiated, so all
        # forward passes run without autograd bookkeeping.
        with torch.no_grad():
            text_inputs = self.tokenizer(texts, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt').to(device)

            # Top-k masked-LM predictions per token position: the substitute pool.
            mlm_logits = self.ref_net(text_inputs.input_ids, attention_mask=text_inputs.attention_mask).logits
            word_pred_scores_all, word_predictions = torch.topk(mlm_logits, self.topk, -1)  # seq-len k

            # Embeddings of the unmodified texts, used for word-importance scoring.
            origin_output = net.inference_text(text_inputs)
            if self.cls:
                origin_embeds = origin_output['text_feat'][:, 0, :].detach()
            else:
                origin_embeds = origin_output['text_feat'].flatten(1).detach()

            final_adverse = []
            for i, text in enumerate(texts):
                important_scores = self.get_important_scores(text, net, origin_embeds[i], self.batch_size, self.max_length)

                # (word index, score) pairs, most important word first.
                list_of_index = sorted(enumerate(important_scores), key=lambda x: x[1], reverse=True)

                words, _, keys = self._tokenize(text)
                final_words = list(words)
                change = 0

                for word_idx, _ in list_of_index:
                    if change >= self.num_perturbation:
                        break

                    tgt_word = words[word_idx]
                    if tgt_word in filter_words:
                        continue
                    start, end = keys[word_idx]
                    if start > self.max_length - 2:
                        continue  # word lies beyond the truncated sequence

                    # NOTE(review): start/end count sub-word tokens of the bare
                    # text from 0, while word_predictions is indexed by position
                    # in the tokenizer output. If the tokenizer prepends a
                    # special token these are shifted by one - confirm.
                    substitutes = word_predictions[i, start:end]  # L, k
                    word_pred_scores = word_pred_scores_all[i, start:end]

                    substitutes = get_substitues(substitutes, self.tokenizer, self.ref_net, 1, word_pred_scores,
                                                 self.threshold_pred_score)

                    # Entry 0 is the unchanged text so "no replacement" can win.
                    replace_texts = [' '.join(final_words)]
                    available_substitutes = [tgt_word]
                    for substitute in substitutes:
                        if substitute == tgt_word:
                            continue  # filter out original word
                        if '##' in substitute:
                            continue  # filter out sub-word
                        if substitute in filter_words:
                            continue

                        temp_replace = list(final_words)
                        temp_replace[word_idx] = substitute
                        available_substitutes.append(substitute)
                        replace_texts.append(' '.join(temp_replace))

                    replace_text_input = self.tokenizer(replace_texts, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt').to(device)
                    replace_output = net.inference_text(replace_text_input)
                    if self.cls:
                        replace_embeds = replace_output['text_feat'][:, 0, :]
                    else:
                        replace_embeds = replace_output['text_feat'].flatten(1)

                    # Pick the candidate that is least similar to the paired image.
                    loss = self.loss_func(replace_embeds, img_embeds, i)
                    chosen = available_substitutes[int(loss.argmax())]

                    final_words[word_idx] = chosen
                    if chosen != tgt_word:
                        change += 1

                final_adverse.append(' '.join(final_words))

        return final_adverse

    def loss_func(self, txt_embeds, img_embeds, label):
        loss_TaIcpos = -txt_embeds.mul(img_embeds[label].repeat(len(txt_embeds), 1)).sum(-1) 
        # print(loss_TaIcpos)
        loss = loss_TaIcpos
        return loss
    
    def optimal_transport_loss(self, txt_embeds, img_embeds, label):
        device = txt_embeds.device

        # 根据标签选择特定的图像嵌入
        selected_img_embeds = img_embeds[label].unsqueeze(0)

        # 计算成本矩阵，这里使用负的内积作为成本
        cost_matrix = -torch.matmul(txt_embeds, selected_img_embeds.T)
        cost_matrix = torch.exp(-cost_matrix)

        N = txt_embeds.shape[0]  # 文本嵌入的数量
        M = selected_img_embeds.shape[0]  # 选择的图像嵌入的数量

        u = torch.ones(N, device=device) / N  # 源分布的边际概率向量
        v = torch.ones(M, device=device) / M  # 目标分布的边际概率向量

        transport_plan = self.Sinkhorn(cost_matrix, u, v)

        # 计算最优传输损失
        loss = torch.sum(transport_plan * cost_matrix)
        return loss


    def Sinkhorn(self, K, u, v, max_iter=100, thresh=1e-2):
        r = torch.ones_like(u)
        c = torch.ones_like(v)
        for i in range(max_iter):
            r0 = r
            r = u / torch.matmul(K, c.unsqueeze(-1)).squeeze(-1)
            c = v / torch.matmul(K.t(), r.unsqueeze(-1)).squeeze(-1)
            err = (r - r0).abs().mean()
            if err.item() < thresh:
                break
        T = torch.matmul(r.unsqueeze(-1), c.unsqueeze(-2)) * K
        return T

    # def optimal_transport_loss(self, text_embeds, img_embed):
    #     cost_matrix = self.compute_cost_matrix(text_embeds, img_embed.unsqueeze(0))
    #     u = torch.ones(text_embeds.shape[0], device=text_embeds.device)
    #     v = torch.ones(1, device=img_embed.device)
    #     transport_plan = self.Sinkhorn(cost_matrix, u, v)
    #     loss = torch.sum(transport_plan * cost_matrix, dim=1)
    #     return loss
    
    # def compute_cost_matrix(self, txt_embeds, img_embeds):
    #     dot_products = torch.matmul(txt_embeds, img_embeds.t())
    #     cost_matrix = dot_products
    #     return cost_matrix
    
    # def Sinkhorn(self, K, u, v):
    #     r = torch.ones_like(u)
    #     c = torch.ones_like(v)
    #     thresh = 1e-2
    #     for i in range(self.max_iter):
    #         r0 = r
    #         r = u / torch.matmul(K, c.unsqueeze(-1)).squeeze(-1)
    #         c = v / torch.matmul(K.t(), r.unsqueeze(-1)).squeeze(-1)
    #         err = (r - r0).abs().mean()
    #         if err.item() < thresh:
    #             break

    #     T = torch.matmul(r.unsqueeze(-1), c.unsqueeze(-2)) * K

    #     return T


    def attack(self, net, texts):
        """Replace words in each text so its embedding drifts away from the original.

        For every text the words are ranked by ``get_important_scores``.
        Walking that ranking, each eligible word is swapped for the masked-LM
        candidate (proposed by ``self.ref_net``) whose embedding has the largest
        summed ``KLDivLoss`` against the original text's embedding (both passed
        through a softmax); keeping the original word is always one of the
        options.  At most ``self.num_perturbation`` words are changed per text.

        NOTE(review): this method reads ``'text_embed'`` from
        ``net.inference_text`` while ``get_important_scores``, which it calls,
        reads ``'text_feat'`` - ``net`` has to provide both keys; confirm this
        is intended.

        Args:
            net: Attacked model exposing ``inference_text(inputs)``.
            texts: List of strings; words are split on single spaces.

        Returns:
            List of (possibly modified) strings, one per input text.
        """
        device = self.ref_net.device
        criterion = torch.nn.KLDivLoss(reduction='none')

        # Candidates are only ranked, nothing is differentiated, so all
        # forward passes run without autograd bookkeeping.
        with torch.no_grad():
            text_inputs = self.tokenizer(texts, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt').to(device)

            # Top-k masked-LM predictions per token position: the substitute pool.
            mlm_logits = self.ref_net(text_inputs.input_ids, attention_mask=text_inputs.attention_mask).logits
            word_pred_scores_all, word_predictions = torch.topk(mlm_logits, self.topk, -1)  # seq-len k

            # Embeddings of the unmodified texts: the reference to move away from.
            origin_output = net.inference_text(text_inputs)
            if self.cls:
                origin_embeds = origin_output['text_embed'][:, 0, :].detach()
            else:
                origin_embeds = origin_output['text_embed'].flatten(1).detach()

            final_adverse = []
            for i, text in enumerate(texts):
                important_scores = self.get_important_scores(text, net, origin_embeds[i], self.batch_size, self.max_length)

                # (word index, score) pairs, most important word first.
                list_of_index = sorted(enumerate(important_scores), key=lambda x: x[1], reverse=True)

                words, _, keys = self._tokenize(text)
                final_words = list(words)
                change = 0

                for word_idx, _ in list_of_index:
                    if change >= self.num_perturbation:
                        break

                    tgt_word = words[word_idx]
                    if tgt_word in filter_words:
                        continue
                    start, end = keys[word_idx]
                    if start > self.max_length - 2:
                        continue  # word lies beyond the truncated sequence

                    # NOTE(review): start/end count sub-word tokens of the bare
                    # text from 0, while word_predictions is indexed by position
                    # in the tokenizer output. If the tokenizer prepends a
                    # special token these are shifted by one - confirm.
                    substitutes = word_predictions[i, start:end]  # L, k
                    word_pred_scores = word_pred_scores_all[i, start:end]

                    substitutes = get_substitues(substitutes, self.tokenizer, self.ref_net, 1, word_pred_scores,
                                                 self.threshold_pred_score)

                    # Entry 0 is the unchanged text so "no replacement" can win.
                    replace_texts = [' '.join(final_words)]
                    available_substitutes = [tgt_word]
                    for substitute in substitutes:
                        if substitute == tgt_word:
                            continue  # filter out original word
                        if '##' in substitute:
                            continue  # filter out sub-word
                        if substitute in filter_words:
                            continue

                        temp_replace = list(final_words)
                        temp_replace[word_idx] = substitute
                        available_substitutes.append(substitute)
                        replace_texts.append(' '.join(temp_replace))

                    replace_text_input = self.tokenizer(replace_texts, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt').to(device)
                    replace_output = net.inference_text(replace_text_input)
                    if self.cls:
                        replace_embeds = replace_output['text_embed'][:, 0, :]
                    else:
                        replace_embeds = replace_output['text_embed'].flatten(1)

                    # Pick the candidate that diverges most from the original embedding.
                    loss = criterion(replace_embeds.log_softmax(dim=-1), origin_embeds[i].softmax(dim=-1).repeat(len(replace_embeds), 1))
                    loss = loss.sum(dim=-1)
                    chosen = available_substitutes[int(loss.argmax())]

                    final_words[word_idx] = chosen
                    if chosen != tgt_word:
                        change += 1

                final_adverse.append(' '.join(final_words))

        return final_adverse

 
    def _tokenize(self, text):
        """Split ``text`` on single spaces and tokenize every word separately.

        Returns:
            Tuple ``(words, sub_words, keys)``: the words, the concatenated
            sub-word tokens of all words, and for each word the
            ``[start, end)`` range of its tokens inside ``sub_words``.
        """
        words = text.split(' ')

        sub_words, keys = [], []
        cursor = 0
        for word in words:
            pieces = self.tokenizer.tokenize(word)
            sub_words.extend(pieces)
            span_end = cursor + len(pieces)
            keys.append([cursor, span_end])
            cursor = span_end

        return words, sub_words, keys

    def _get_masked(self, text):
        """Return one copy of the word list per word, with that word replaced by ``'[UNK]'``.

        Words are obtained by splitting ``text`` on single spaces.
        """
        words = text.split(' ')
        # Variant ``pos`` keeps every word except the one at ``pos``.
        return [words[:pos] + ['[UNK]'] + words[pos + 1:] for pos in range(len(words))]

    def get_important_scores(self, text, net, origin_embeds, batch_size, max_length):
        device = origin_embeds.device

        masked_words = self._get_masked(text)
        masked_texts = [' '.join(words) for words in masked_words]  # list of text of masked words

        masked_embeds = []
        for i in range(0, len(masked_texts), batch_size):
            masked_text_input = self.tokenizer(masked_texts[i:i+batch_size], padding='max_length', truncation=True, max_length=max_length, return_tensors='pt').to(device)
            masked_output = net.inference_text(masked_text_input)
            if self.cls:
                masked_embed = masked_output['text_feat'][:, 0, :].detach()
            else:
                masked_embed = masked_output['text_feat'].flatten(1).detach()
            masked_embeds.append(masked_embed)
        masked_embeds = torch.cat(masked_embeds, dim=0)

        criterion = torch.nn.KLDivLoss(reduction='none')

        import_scores = criterion(masked_embeds.log_softmax(dim=-1), origin_embeds.softmax(dim=-1).repeat(len(masked_texts), 1))

        return import_scores.sum(dim=-1)



def get_substitues(substitutes, tokenizer, mlm_model, use_bpe, substitutes_score=None, threshold=3.0):
    """Turn a matrix of candidate token ids into candidate replacement words.

    Args:
        substitutes: 2-D tensor of token ids, one row per sub-word position of
            the target word and one column per candidate.
        tokenizer: Provides ``_convert_id_to_token``.
        mlm_model: Forwarded to ``get_bpe_substitues`` for multi-token words.
        use_bpe: When equal to 1, words spanning several sub-word positions are
            handled by ``get_bpe_substitues``; otherwise they yield no candidates.
        substitutes_score: Scores aligned with ``substitutes``.  For
            single-token words, candidates are taken in order until the first
            score below ``threshold``.  ``None`` disables this filtering.
        threshold: Score cut-off; ``0`` also disables the filtering.

    Returns:
        List of candidate strings (possibly empty).
    """
    sub_len, _ = substitutes.size()  # sub-len, k

    if sub_len == 0:
        return []

    if sub_len == 1:
        candidate_ids = substitutes[0]
        if substitutes_score is None:
            # No scores supplied (the parameter's default): keep every
            # candidate instead of failing on ``None[0]``.
            return [tokenizer._convert_id_to_token(int(token_id)) for token_id in candidate_ids]

        words = []
        for token_id, score in zip(candidate_ids, substitutes_score[0]):
            # Candidates are consumed in order; stop at the first weak one.
            if threshold != 0 and score < threshold:
                break
            words.append(tokenizer._convert_id_to_token(int(token_id)))
        return words

    # The target word spans several sub-word tokens.
    if use_bpe == 1:
        return get_bpe_substitues(substitutes, tokenizer, mlm_model)
    return []


def get_bpe_substitues(substitutes, tokenizer, mlm_model):
    """Build multi-token replacement words and order them by masked-LM perplexity.

    Only the first 12 sub-word positions and the first 4 candidates per
    position are considered.  Token sequences are formed by taking one
    candidate per position (last position varying fastest); the first 24
    sequences are scored with ``mlm_model`` and returned, lowest perplexity
    first, as decoded strings.

    Args:
        substitutes: 2-D tensor of token ids, one row per sub-word position and
            one column per candidate.
        tokenizer: Provides ``_convert_id_to_token`` and
            ``convert_tokens_to_string``.
        mlm_model: Callable on a batch of token ids whose first output holds
            per-position vocabulary logits; must expose ``device``.

    Returns:
        List of candidate strings.
    """
    from itertools import islice, product

    device = mlm_model.device
    max_candidates = 24
    substitutes = substitutes[0:12, 0:4]  # maximum BPE candidates

    # Enumerate combinations lazily and stop after ``max_candidates``:
    # building the full Cartesian product (up to 4**12 sequences) only to
    # keep its first 24 entries is needlessly exponential.
    per_position = substitutes.tolist()
    candidates = [list(combo) for combo in islice(product(*per_position), max_candidates)]

    all_substitutes = torch.tensor(candidates).to(device)  # [ N, L ]
    N, L = all_substitutes.size()

    c_loss = nn.CrossEntropyLoss(reduction='none')
    # Scoring only; no gradients are needed.
    with torch.no_grad():
        word_predictions = mlm_model(all_substitutes)[0]  # N L vocab-size
        ppl = c_loss(word_predictions.view(N * L, -1), all_substitutes.view(-1))  # [ N*L ]
        ppl = torch.exp(torch.mean(ppl.view(N, L), dim=-1))  # N
    _, order = torch.sort(ppl)

    final_words = []
    for idx in order:
        tokens = [tokenizer._convert_id_to_token(int(token_id)) for token_id in all_substitutes[idx]]
        final_words.append(tokenizer.convert_tokens_to_string(tokens))
    return final_words


def compute_cost_matrix(img_embeds, txt_embeds):
    """Return the pairwise cosine distance (``1 - cosine similarity``).

    Args:
        img_embeds: 2-D tensor, one image embedding per row.
        txt_embeds: 2-D tensor, one text embedding per row.

    Returns:
        2-D tensor with one row per image and one column per text.
    """
    # Unit-length rows turn the matrix product into cosine similarity.
    unit_imgs = F.normalize(img_embeds, dim=1)
    unit_txts = F.normalize(txt_embeds, dim=1)

    # Convert similarity into a cost.
    return 1.0 - torch.mm(unit_imgs, unit_txts.t())

