import copy
import torch
import torch.nn as nn
import numpy as np 
from torchvision import transforms
from PIL import Image
import torch.nn.functional as F
import random
import time


class Attacker():
    """Run a text attack and an image attack against an image-text model.

    Args:
        model_inf: Stored on the instance; not used by ``attack``.
        model_infi: Image encoder. Called with normalized images and expected
            to return a mapping with an ``'image_feat'`` entry.
        model_inft: Text encoder. Called with ``(input_ids, attention_mask)``
            and expected to return a mapping with a ``'text_feat'`` entry.
        img_attacker: Object providing ``normalization`` and
            ``txt_guided_attack``.
        txt_attacker: Object providing ``tokenizer`` and
            ``img_guided_attack1``.
    """

    def __init__(self, model_inf, model_infi, model_inft, img_attacker, txt_attacker):
        self.model_inf = model_inf
        self.model_infi = model_infi
        self.model_inft = model_inft
        self.img_attacker = img_attacker
        self.txt_attacker = txt_attacker

    def attack(self, imgs, txts, txt2img, device='cpu', max_length=30, scales=None, **kwargs):
        """Craft adversarial images and adversarial texts for one batch.

        Args:
            imgs: Batch of images handed to ``img_attacker.normalization``.
            txts: List of caption strings.
            txt2img: For every text, the index of its paired image in ``imgs``.
            device: Device the tokenized text tensors are moved to.
            max_length: Padding/truncation length used when tokenizing.
            scales: Forwarded unchanged to ``img_attacker.txt_guided_attack``.
            **kwargs: Accepted and ignored.

        Returns:
            Tuple ``(adv_imgs, adv_txts)``.
        """
        # Features of the clean images supervise the text attack.
        with torch.no_grad():
            origin_img_output = self.model_infi(self.img_attacker.normalization(imgs))
            img_supervisions = origin_img_output['image_feat'][txt2img]
        adv_txts = self.txt_attacker.img_guided_attack1(self.model_inft, txts, img_embeds=img_supervisions)

        # Features of the clean (not the adversarial) texts supervise the
        # image attack.
        with torch.no_grad():
            txts_input = self.txt_attacker.tokenizer(txts, padding='max_length', truncation=True,
                                                     max_length=max_length, return_tensors="pt")
            txts_input.input_ids = txts_input.input_ids.to(device)
            txts_input.attention_mask = txts_input.attention_mask.to(device)
            txts_output = self.model_inft(txts_input.input_ids, txts_input.attention_mask)
            txt_supervisions = txts_output['text_feat']

        attack_result = self.img_attacker.txt_guided_attack(self.model_infi, imgs, txt2img, device,
                                                            scales=scales, txt_embeds=txt_supervisions)
        # The image attacker may return the adversarial batch alone or a
        # tuple whose first entry is the adversarial batch. Unpacking a bare
        # tensor into two names would iterate over its batch dimension.
        if isinstance(attack_result, (tuple, list)):
            adv_imgs = attack_result[0]
        else:
            adv_imgs = attack_result

        return adv_imgs, adv_txts
  
class ImageAttacker():
    """Iterative image attack with a per-pixel bound, guided by text embeddings.

    The perturbation ``delta`` is updated for ``steps`` iterations by a signed
    gradient step that increases ``loss_func`` and is clamped to
    ``[-eps, eps]`` after every update. From the second iteration on, the
    update direction also merges gradients taken at random combinations of
    the perturbations produced by earlier iterations.

    Args:
        normalization: Callable applied to images before every model call.
        eps: Maximum absolute value of any perturbation entry.
        steps: Number of update iterations.
        step_size: Length of each signed-gradient step.
        sample_numbers: Stored on the instance; not read by this class.
        crop_num: Number of random crops ``get_crop_imgs`` appends to the
            batch, and number of leading batch-sized chunks summed into the
            loss. NOTE(review): the code previously read this value from an
            undefined name; the default of 1 mirrors an otherwise unused
            local ``aug_num = 1`` -- confirm the intended value.
    """

    def __init__(self, normalization, eps=2/255, steps=10, step_size=0.5/255, sample_numbers=5, crop_num=1):
        self.normalization = normalization
        self.eps = eps
        self.steps = steps
        self.step_size = step_size
        self.sample_numbers = sample_numbers
        self.crop_num = crop_num

    def loss_func(self, adv_imgs_embeds, txts_embeds, txt2img):
        """Return the negated image-text similarity of the paired entries.

        Args:
            adv_imgs_embeds: 2-D image embeddings, one row per image.
            txts_embeds: 2-D text embeddings, one row per text.
            txt2img: ``txt2img[i]`` is the image row paired with text ``i``.

        Returns:
            Scalar tensor: minus the paired similarities summed per image row
            and averaged over all image rows.
        """
        device = adv_imgs_embeds.device

        it_sim_matrix = adv_imgs_embeds @ txts_embeds.T

        # One-hot pairing mask: entry (txt2img[i], i) is set for every text i.
        it_labels = torch.zeros(it_sim_matrix.shape, device=device)
        img_rows = torch.as_tensor(txt2img, dtype=torch.long, device=device)
        txt_cols = torch.arange(len(txt2img), device=device)
        it_labels[img_rows, txt_cols] = 1

        return -(it_sim_matrix * it_labels).sum(-1).mean()

    def _loss_at(self, model, imgs, delta, txt_embeds, txt2img):
        """Return ``loss_func`` at ``imgs + delta`` as a float, without building a graph."""
        with torch.no_grad():
            adv_imgs = torch.clamp(imgs.detach() + delta, 0.0, 1.0)
            embeds = model(self.normalization(adv_imgs))['image_feat']
            return self.loss_func(embeds, txt_embeds, txt2img).item()

    def _normalized_grad(self, model, imgs, delta, txt_embeds, txt2img, se_num, device, chunked=True):
        """Return d(loss)/d(delta) at ``imgs + delta``, scaled to unit mean magnitude per image.

        With ``chunked=True`` the loss is the sum of ``loss_func`` over the
        first ``se_num`` batch-sized chunks of the crop-augmented batch; with
        ``chunked=False`` ``loss_func`` is applied once to the whole
        augmented batch.
        """
        b = imgs.shape[0]
        delta_ = delta.clone().detach().requires_grad_(True)

        # enable_grad keeps this working even if the caller is in no_grad.
        with torch.enable_grad():
            adv_imgs = torch.clamp(imgs.detach() + delta_, 0.0, 1.0)
            scaled_imgs = self.get_crop_imgs(adv_imgs, se_num, device)
            adv_imgs_embeds = model(self.normalization(scaled_imgs))['image_feat']

            if chunked:
                # NOTE(review): get_crop_imgs returns se_num + 1 chunks but
                # only the first se_num enter the loss -- confirm intent.
                loss = torch.zeros((), dtype=torch.float32, device=device)
                for i in range(se_num):
                    loss = loss + self.loss_func(adv_imgs_embeds[i*b:i*b+b], txt_embeds, txt2img)
            else:
                loss = self.loss_func(adv_imgs_embeds, txt_embeds, txt2img)

        grad = torch.autograd.grad(loss, [delta_])[0]
        scale = torch.mean(torch.abs(grad), dim=(1, 2, 3), keepdim=True)
        # Guard against 0/0 -> NaN when a gradient vanishes completely.
        return grad / scale.clamp_min(torch.finfo(grad.dtype).tiny)

    #delta diversity
    def txt_guided_attack(self, model, imgs, txt2img, device, scales=None, txt_embeds=None, txt_embeds0=None):
        """Craft adversarial images that lower similarity to their paired texts.

        Args:
            model: Image encoder; must provide ``eval()`` and return a mapping
                with an ``'image_feat'`` entry when called.
            imgs: 4-D image batch. Perturbed images are clamped to ``[0, 1]``.
            txt2img: ``txt2img[i]`` is the image index paired with text ``i``.
            device: Device on which the perturbation is allocated.
            scales: Accepted for interface compatibility; not used.
            txt_embeds: Text embeddings supervising the attack.
            txt_embeds0: Accepted for interface compatibility; not used.

        Returns:
            The adversarial image batch, same shape as ``imgs``.

        Raises:
            ValueError: If ``crop_num`` is smaller than 1.
        """
        model.eval()

        se_num = self.crop_num
        if se_num is None or se_num < 1:
            raise ValueError("crop_num must be a positive integer")

        clean_imgs = imgs.detach()
        delta = torch.zeros_like(imgs).to(device)

        start_time = time.time()

        # Loss of the clean images: a candidate perturbation only contributes
        # a gradient when it raises the loss above this bar.
        loss_bar = self._loss_at(model, clean_imgs, delta, txt_embeds, txt2img)

        past_deltas = []  # perturbation produced by each finished iteration
        adv_imgs = torch.clamp(clean_imgs + delta, 0.0, 1.0)

        num_explorations = 5
        for step in range(self.steps):
            if step == 0:
                # First iteration: plain signed-gradient step from delta = 0.
                grad = self._normalized_grad(model, clean_imgs, delta, txt_embeds, txt2img,
                                             se_num, device, chunked=False)
            else:
                grad = torch.zeros_like(imgs).to(device)

                # Exploration of diverse update directions.
                for _ in range(num_explorations):
                    # 1) Convex combination of past perturbations.
                    coeffs_pos = torch.rand(step, device=device)
                    coeffs_pos = coeffs_pos / coeffs_pos.sum()
                    delta0 = sum(c * d for c, d in zip(coeffs_pos, past_deltas))

                    # 2) Combination with mixed-sign weights that sum to one.
                    if step < 2:
                        coeffs_neg = torch.tensor([1.0], device=device)
                    else:
                        coeffs_neg = torch.randn(step, device=device)
                        while (coeffs_neg > 0).all() or (coeffs_neg < 0).all() or coeffs_neg.sum().abs() < 1e-6:
                            coeffs_neg = torch.randn(step, device=device)
                        coeffs_neg /= coeffs_neg.sum()
                    delta1 = sum(c * d for c, d in zip(coeffs_neg, past_deltas))

                    delta0 = torch.clamp(delta0, -self.eps, self.eps)
                    delta1 = torch.clamp(delta1, -self.eps, self.eps)

                    loss0 = self._loss_at(model, clean_imgs, delta0, txt_embeds, txt2img)
                    loss1 = self._loss_at(model, clean_imgs, delta1, txt_embeds, txt2img)

                    if loss0 > loss_bar:
                        grad = grad + self._normalized_grad(model, clean_imgs, delta0.detach(), txt_embeds,
                                                            txt2img, se_num, device)
                    if loss1 > loss_bar:
                        grad = grad + self._normalized_grad(model, clean_imgs, delta1.detach(), txt_embeds,
                                                            txt2img, se_num, device)

                # Gradient at the current perturbation. It is evaluated at
                # imgs + delta; adding delta to an already perturbed batch
                # would apply the perturbation twice.
                grad = grad + self._normalized_grad(model, clean_imgs, delta, txt_embeds, txt2img,
                                                    se_num, device)

            with torch.no_grad():
                delta = delta + self.step_size * grad.sign()
                delta = torch.clamp(delta, -self.eps, self.eps)

            adv_imgs = torch.clamp(clean_imgs + delta, 0.0, 1.0)
            past_deltas.append(delta.clone().detach())

        end_time = time.time()

        elapsed_time = end_time - start_time
        print(f"The function execution time: {elapsed_time} seconds")

        return adv_imgs

    def get_crop_imgs(self, imgs, num=None, device='cuda'):
        """Return ``imgs`` followed by ``num`` random resized crops of it.

        Args:
            imgs: Image batch; its last two dimensions give the output size.
            num: Number of crops to append. ``None`` returns ``imgs`` as is.
            device: Accepted for interface compatibility; not used.

        Returns:
            ``imgs`` itself when ``num`` is ``None``; otherwise the
            concatenation, along dimension 0, of ``imgs`` and ``num`` crops
            resized back to the original spatial size.
        """
        if num is None:
            return imgs

        ori_shape = (imgs.shape[-2], imgs.shape[-1])
        local_transform = transforms.RandomResizedCrop(ori_shape, scale=(0.5, 0.8))

        crops = [local_transform(imgs) for _ in range(num)]
        return torch.cat([imgs] + crops, 0)



# Words and punctuation marks that the text attack neither replaces nor uses
# as a replacement (see the membership checks in
# TextAttacker.img_guided_attack1). The list is converted to a set below so
# those checks are hash lookups; duplicate entries collapse in the set.
filter_words = ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost',
                'alone', 'along', 'already', 'also', 'although', 'am', 'among', 'amongst', 'an', 'and', 'another',
                'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', "aren't", 'around', 'as',
                'at', 'back', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides',
                'between', 'beyond', 'both', 'but', 'by', 'can', 'cannot', 'could', 'couldn', "couldn't", 'd', 'didn',
                "didn't", 'doesn', "doesn't", 'don', "don't", 'down', 'due', 'during', 'either', 'else', 'elsewhere',
                'empty', 'enough', 'even', 'ever', 'everyone', 'everything', 'everywhere', 'except', 'first', 'for',
                'former', 'formerly', 'from', 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'he', 'hence',
                'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his',
                'how', 'however', 'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is', 'isn', "isn't", 'it', "it's",
                'its', 'itself', 'just', 'latter', 'latterly', 'least', 'll', 'may', 'me', 'meanwhile', 'mightn',
                "mightn't", 'mine', 'more', 'moreover', 'most', 'mostly', 'must', 'mustn', "mustn't", 'my', 'myself',
                'namely', 'needn', "needn't", 'neither', 'never', 'nevertheless', 'next', 'no', 'nobody', 'none',
                'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'on', 'once', 'one', 'only',
                'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'per',
                'please', 's', 'same', 'shan', "shan't", 'she', "she's", "should've", 'shouldn', "shouldn't", 'somehow',
                'something', 'sometime', 'somewhere', 'such', 't', 'than', 'that', "that'll", 'the', 'their', 'theirs',
                'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein',
                'thereupon', 'these', 'they', 'this', 'those', 'through', 'throughout', 'thru', 'thus', 'to', 'too',
                'toward', 'towards', 'under', 'unless', 'until', 'up', 'upon', 'used', 've', 'was', 'wasn', "wasn't",
                'we', 'were', 'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
                'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while',
                'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'with', 'within', 'without', 'won',
                "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you', "you'd", "you'll", "you're", "you've",
                'your', 'yours', 'yourself', 'yourselves', '.', '-', 'a the', '/', '?', 'some', '"', ',', 'b', '&', '!',
                '@', '%', '^', '*', '(', ')', "-", '-', '+', '=', '<', '>', '|', ':', ";", '～', '·']
filter_words = set(filter_words)
    

class TextAttacker():
    """Word-substitution text attack guided by image embeddings.

    For every text, the most important words are ranked, replacement
    candidates are drawn from a masked-language model, and the single
    replacement with the highest ``loss_func`` value is applied.

    Args:
        ref_net: Masked-language model used to propose substitutes. It may be
            wrapped in an object exposing the real model as ``.module``.
        tokenizer: Tokenizer shared by ``ref_net`` and the attacked encoder.
        cls: If true, use position 0 of ``'text_feat'`` as the text
            embedding; otherwise flatten ``'text_feat'`` per text.
        max_length: Padding/truncation length used when tokenizing.
        number_perturbation: Candidate collection stops once this many
            changes have been made; a value <= 0 leaves every text unchanged.
        topk: Number of masked-language-model predictions kept per position.
        threshold_pred_score: Score threshold forwarded to ``get_substitues``.
        batch_size: Batch size used when scoring word importance.
        text_ratios: Stored (as a copy) on the instance; not read by this
            class.
    """

    def __init__(self, ref_net, tokenizer, cls=True, max_length=30, number_perturbation=1, topk=10, threshold_pred_score=0.3, batch_size=32,text_ratios=[0.6,0.2,0.2]):
        self.ref_net = ref_net
        self.tokenizer = tokenizer
        self.max_length = max_length
        # epsilon_txt
        self.num_perturbation = number_perturbation
        self.threshold_pred_score = threshold_pred_score
        self.topk = topk
        self.batch_size = batch_size
        self.cls = cls
        # Copy so instances never share, or mutate, the default list object.
        self.text_ratios = copy.copy(text_ratios)

    def _pool(self, text_feat):
        """Reduce an encoder's ``'text_feat'`` output to one vector per text."""
        if self.cls:
            return text_feat[:, 0, :]
        return text_feat.flatten(1)

    def _embed_texts(self, net, texts, device, max_length):
        """Tokenize ``texts``, run ``net`` on them and return pooled embeddings."""
        encoded = self.tokenizer(texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
        output = net(encoded.input_ids.to(device), encoded.attention_mask.to(device))
        return self._pool(output['text_feat'])

    @torch.no_grad()
    def img_guided_attack1(self, net, texts, img_embeds = None, adv_img_embeds = None,last_adv_img_embeds=None,last_adv_img_embeds1=None):
        """Return one adversarial text per input text.

        Args:
            net: Text encoder called with ``(input_ids, attention_mask)`` and
                returning a mapping with a ``'text_feat'`` entry.
            texts: List of strings to attack.
            img_embeds: Image embeddings; row ``i`` supervises text ``i``.
            adv_img_embeds: Optional additional image embeddings. When given,
                ``last_adv_img_embeds`` and ``last_adv_img_embeds1`` are used
                as well and the four losses are mixed with fixed weights.
            last_adv_img_embeds: See ``adv_img_embeds``.
            last_adv_img_embeds1: See ``adv_img_embeds``.

        Returns:
            List of strings, same length and order as ``texts``.
        """
        device = img_embeds.device
        # The masked-language model may or may not sit behind a `.module`
        # wrapper; accept both.
        mlm_model = getattr(self.ref_net, 'module', self.ref_net)

        text_inputs = self.tokenizer(texts, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        input_ids = text_inputs.input_ids.to(device)
        attention_mask = text_inputs.attention_mask.to(device)

        # substitutes
        mlm_logits = mlm_model(input_ids, attention_mask=attention_mask).logits
        word_pred_scores_all, word_predictions = torch.topk(mlm_logits, self.topk, -1)  # seq-len k

        # original state
        origin_embeds = self._pool(net(input_ids, attention_mask)['text_feat']).detach()

        # Mixing weights used when additional image embeddings are supplied.
        text_ratios1 = [0.4, 0.2, 0.2, 0.2]
        top_k = 5  # how many of the most important words to consider

        final_adverse = []
        for i, text in enumerate(texts):
            # word importance eval
            important_scores = self.get_important_scores(text, net, origin_embeds[i], self.batch_size, self.max_length)
            scores = important_scores.tolist()
            list_of_index = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:top_k]

            words, _, keys = self._tokenize(text)
            final_words = list(words)
            change = 0

            mapping = []  # (word_idx, substitute) for replace_texts[1:]
            replace_texts = [' '.join(final_words)]  # index 0 is the unmodified text

            for word_idx in list_of_index:
                if change >= self.num_perturbation:
                    break

                tgt_word = words[word_idx]
                if tgt_word in filter_words:
                    continue
                start, end = keys[word_idx]
                if start > self.max_length - 2:
                    continue

                substitutes = word_predictions[i, start:end]  # L, k
                word_pred_scores = word_pred_scores_all[i, start:end]

                substitutes = get_substitues(substitutes, self.tokenizer, self.ref_net, 1, word_pred_scores,
                                             self.threshold_pred_score)

                for substitute in substitutes:
                    # Skip the original word, sub-word pieces and filtered words.
                    if substitute == tgt_word or '##' in substitute or substitute in filter_words:
                        continue

                    temp_replace = list(final_words)
                    temp_replace[word_idx] = substitute
                    replace_texts.append(' '.join(temp_replace))
                    mapping.append((word_idx, substitute))

            if not mapping:
                # No candidate to score: the text stays as it is.
                final_adverse.append(' '.join(final_words))
                continue

            replace_embeds = self._embed_texts(net, replace_texts, device, self.max_length)

            if adv_img_embeds is None:
                loss = self.loss_func(replace_embeds, img_embeds, i)
            else:
                loss = (text_ratios1[0] * self.loss_func(replace_embeds, img_embeds, i)
                        + text_ratios1[1] * self.loss_func(replace_embeds, adv_img_embeds, i)
                        + text_ratios1[2] * self.loss_func(replace_embeds, last_adv_img_embeds, i)
                        + text_ratios1[3] * self.loss_func(replace_embeds, last_adv_img_embeds1, i))

            # Apply the best available substitution; if the unmodified text
            # (index 0) ranks first, nothing is changed.
            for idx in loss.argsort(descending=True).tolist():
                if idx == 0:
                    break

                word_idx, substitute = mapping[idx - 1]
                if final_words[word_idx] != substitute:
                    final_words[word_idx] = substitute
                    change += 1
                    break
            final_adverse.append(' '.join(final_words))

        return final_adverse

    def loss_func(self, txt_embeds, img_embeds, label):
        """Return, per text embedding, minus its dot product with ``img_embeds[label]``."""
        loss_TaIcpos = -txt_embeds.mul(img_embeds[label].repeat(len(txt_embeds), 1)).sum(-1)
        return loss_TaIcpos

    def _tokenize(self, text):
        """Split ``text`` on single spaces and tokenize every word.

        Returns:
            ``(words, sub_words, keys)`` where ``keys[j]`` is the
            ``[start, end)`` range of word ``j`` inside ``sub_words``.
        """
        words = text.split(' ')

        sub_words = []
        keys = []
        index = 0
        for word in words:
            sub = self.tokenizer.tokenize(word)
            sub_words += sub
            keys.append([index, index + len(sub)])
            index += len(sub)

        return words, sub_words, keys

    def _get_masked(self, text):
        """Return one word list per word of ``text`` with that word replaced by ``'[UNK]'``."""
        words = text.split(' ')
        return [words[0:i] + ['[UNK]'] + words[i + 1:] for i in range(len(words))]

    @torch.no_grad()
    def get_important_scores(self, text, net, origin_embeds, batch_size, max_length):
        """Score every word of ``text`` by how much masking it shifts the embedding.

        Args:
            text: The text whose words are scored.
            net: Text encoder (see ``img_guided_attack1``).
            origin_embeds: Pooled embedding of the unmodified text.
            batch_size: Number of masked variants encoded per forward pass.
            max_length: Padding/truncation length used when tokenizing.

        Returns:
            1-D tensor with one score per word: the KL-divergence terms
            between the softmax of the original embedding and the softmax of
            the masked embedding, summed over the embedding dimension.
        """
        device = origin_embeds.device

        masked_texts = [' '.join(words) for words in self._get_masked(text)]

        masked_embeds = torch.cat(
            [self._embed_texts(net, masked_texts[i:i + batch_size], device, max_length).detach()
             for i in range(0, len(masked_texts), batch_size)],
            dim=0,
        )

        criterion = torch.nn.KLDivLoss(reduction='none')
        import_scores = criterion(masked_embeds.log_softmax(dim=-1),
                                  origin_embeds.softmax(dim=-1).repeat(len(masked_texts), 1))

        return import_scores.sum(dim=-1)

def get_substitues(substitutes, tokenizer, mlm_model, use_bpe, substitutes_score=None, threshold=3.0):
    """Turn masked-language-model predictions for one word into candidate strings.

    Args:
        substitutes: 2-D tensor of token ids, one row per sub-word position
            and one column per prediction.
        tokenizer: Tokenizer providing ``_convert_id_to_token``.
        mlm_model: Masked-language model, forwarded to ``get_bpe_substitues``.
        use_bpe: When equal to 1, multi-sub-word targets are handled by
            ``get_bpe_substitues``; otherwise they yield no candidates.
        substitutes_score: Scores aligned with ``substitutes``; read only for
            single-sub-word targets.
        threshold: Scanning a single-sub-word row stops at the first score
            below this value. A threshold of 0 disables the cut-off.

    Returns:
        List of candidate strings, possibly empty.
    """
    num_positions, _ = substitutes.size()  # sub-len, k

    if num_positions == 0:
        return []

    if num_positions > 1:
        # The word spans several sub-word tokens.
        if use_bpe == 1:
            return get_bpe_substitues(substitutes, tokenizer, mlm_model)
        return []

    # Single sub-word: keep predictions until the score drops below the threshold.
    candidates = []
    for token_id, score in zip(substitutes[0], substitutes_score[0]):
        if threshold != 0 and score < threshold:
            break
        candidates.append(tokenizer._convert_id_to_token(int(token_id)))
    return candidates


def get_bpe_substitues(substitutes, tokenizer, mlm_model):
    """Rank multi-sub-word replacement candidates by masked-language-model perplexity.

    Args:
        substitutes: 2-D tensor of token ids, one row per sub-word position
            and one column per prediction. Only the first 12 rows and first
            4 columns are used.
        tokenizer: Tokenizer providing ``_convert_id_to_token`` and
            ``convert_tokens_to_string``.
        mlm_model: Masked-language model, optionally behind a wrapper that
            exposes the real model as ``.module``. Element 0 of its output
            is used as the per-position vocabulary scores.

    Returns:
        Candidate strings sorted by ascending perplexity; at most 24. An
        empty list when ``substitutes`` has no rows or no columns.
    """
    from itertools import islice, product

    # Resolve a possible wrapper once instead of guessing with bare excepts.
    core_model = getattr(mlm_model, 'module', mlm_model)
    device = getattr(core_model, 'device', None)
    if device is None:
        device = substitutes.device

    substitutes = substitutes[0:12, 0:4]  # maximum BPE candidates
    if substitutes.size(0) == 0 or substitutes.size(1) == 0:
        return []

    # Every combination picks one prediction per position. Only the first 24
    # are ever scored, so take them lazily instead of materialising up to
    # 4**12 lists; the iteration order (last position varies fastest) matches
    # the nested-loop construction.
    max_candidates = 24
    combos = list(islice(product(*substitutes.tolist()), max_candidates))

    c_loss = nn.CrossEntropyLoss(reduction='none')
    all_substitutes = torch.tensor(combos).to(device)  # [ N, L ]
    N, L = all_substitutes.size()
    with torch.no_grad():
        word_predictions = core_model(all_substitutes)[0]
    ppl = c_loss(word_predictions.reshape(N * L, -1), all_substitutes.reshape(-1))  # [ N*L ]
    ppl = torch.exp(torch.mean(ppl.view(N, L), dim=-1))  # N
    _, order = torch.sort(ppl)

    final_words = []
    for idx in order:
        tokens = [tokenizer._convert_id_to_token(int(t)) for t in all_substitutes[idx]]
        final_words.append(tokenizer.convert_tokens_to_string(tokens))
    return final_words