
import inspect
import warnings
from typing import List, Optional, Union
import itertools

import numpy as np
import torch
import torch.nn.functional as F
import copy

import PIL
from accelerate import Accelerator
from diffusers.models import AutoencoderKL, UNet2DConditionModel
from diffusers.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from diffusers.utils import deprecate, logging


from packaging import version
from tqdm.auto import tqdm
from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer,BertTokenizer, BertModel


# Resolve the Pillow resampling filters once at import time. When the installed
# Pillow base version is below 9.1.0 the constants are read directly from
# `PIL.Image`; otherwise they are read from `PIL.Image.Resampling`.
if version.parse(version.parse(PIL.__version__).base_version) < version.parse("9.1.0"):
    PIL_INTERPOLATION = dict(
        linear=PIL.Image.LINEAR,
        bilinear=PIL.Image.BILINEAR,
        bicubic=PIL.Image.BICUBIC,
        lanczos=PIL.Image.LANCZOS,
        nearest=PIL.Image.NEAREST,
    )
else:
    PIL_INTERPOLATION = dict(
        linear=PIL.Image.Resampling.BILINEAR,
        bilinear=PIL.Image.Resampling.BILINEAR,
        bicubic=PIL.Image.Resampling.BICUBIC,
        lanczos=PIL.Image.Resampling.LANCZOS,
        nearest=PIL.Image.Resampling.NEAREST,
    )
# ------------------------------------------------------------------------------

# Module-level logger obtained from the diffusers logging helper.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


def preprocess(image):
    """Turn an image object into a batched, channel-first float tensor in [-1, 1].

    The image is resized (Lanczos filter) so that both sides are rounded down
    to a multiple of 32, converted to float32, scaled from [0, 255] to [0, 1],
    given a leading batch axis, moved from channels-last to channels-first and
    finally mapped to [-1, 1].

    Args:
        image: object exposing `.size` (width, height) and
            `.resize(size, resample=...)`, e.g. a `PIL.Image.Image`. The
            transpose below requires the resized image to convert to a
            rank-3 (height, width, channels) array.

    Returns:
        A `torch.Tensor` of shape (1, channels, height, width).
    """
    width, height = image.size
    # Round each side down to the nearest multiple of 32.
    width -= width % 32
    height -= height % 32

    resized = image.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
    pixels = np.array(resized).astype(np.float32) / 255.0
    # Add the batch axis, then reorder NHWC -> NCHW.
    batched = pixels[None].transpose(0, 3, 1, 2)
    tensor = torch.from_numpy(batched)
    return 2.0 * tensor - 1.0


class DreamBoothForgeditStableDiffusionPipeline(DiffusionPipeline):
   

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
    ):
        """Register the seven pipeline components with the base pipeline.

        Every argument is handed to `register_modules` under a keyword equal
        to its parameter name; the other methods of this class read them back
        as `self.vae`, `self.unet`, and so on.
        """
        super().__init__()
        # Insertion order matches the parameter order of this constructor.
        components = {
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
            "unet": unet,
            "scheduler": scheduler,
            "safety_checker": safety_checker,
            "feature_extractor": feature_extractor,
        }
        self.register_modules(**components)
        

    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
        r"""
        Configure sliced attention on the UNet.

        Slicing makes the attention module process its input in several steps, which trades a small amount of
        speed for lower memory use.

        Args:
            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
                `"auto"` is resolved to half of `unet.config.attention_head_dim`, so attention is computed in two
                steps. Any other value is forwarded to `unet.set_attention_slice` unchanged; a number yields
                `attention_head_dim // slice_size` slices, so `attention_head_dim` must be a multiple of it.
        """
        # Half the head size is usually a good balance between speed and memory.
        resolved_size = self.unet.config.attention_head_dim // 2 if slice_size == "auto" else slice_size
        self.unet.set_attention_slice(resolved_size)

    def disable_attention_slicing(self):
        r"""
        Turn sliced attention off again.

        Delegates to `enable_attention_slicing` with a slice size of `None`; since `None` is not `"auto"`, it is
        forwarded unchanged to `unet.set_attention_slice`.
        """
        self.enable_attention_slicing(slice_size=None)
    
        
    def train(
        self,
        source: Union[str, List[str]],
        prompt: Union[str, List[str]],
        image: Union[torch.FloatTensor, PIL.Image.Image],
        unet_orig=None,
        height: Optional[int] = 512,
        width: Optional[int] = 512,
        bsz=1,
        interpolation='vs',
        generator: Optional[torch.Generator] = None,
        embedding_learning_rate: float = 1e-3,
        diffusion_model_learning_rate: float = 2e-5,
        memory_learning_rate=2e-5,
        text_embedding_optimization_steps: int = 500,
        model_fine_tuning_optimization_steps: int = 1000,
        **kwargs,
    ):
        r"""
        Jointly fine-tune the UNet and the text encoder to reconstruct `image` from `source`.

        Each step adds random noise at random timesteps to the VAE latents of `image`, asks the UNet to predict
        that noise conditioned on the text-encoder output for `source`, and minimises the mean squared error with
        a single Adam optimizer holding two parameter groups (UNet and text encoder). When the loop finishes,
        both modules are frozen again and switched to eval mode.

        Args:
            source: Caption that is tokenized and encoded as the conditioning text.
            prompt: Accepted for interface compatibility; not used by this method.
            image: Training image. A `PIL.Image.Image` is converted with `preprocess`; anything else is used
                as-is (presumably an already preprocessed tensor -- verify against callers).
            unet_orig: Accepted for interface compatibility; not used by this method.
            height: Only validated (must be divisible by 8).
            width: Only validated (must be divisible by 8).
            bsz: Number of (noise, timestep) samples drawn per optimisation step. The loop runs
                `text_embedding_optimization_steps // bsz` iterations.
            interpolation: Accepted for interface compatibility; not used by this method.
            generator: Passed to the sampling of the VAE latent distribution only.
            embedding_learning_rate: Adam learning rate of the text-encoder parameter group.
            diffusion_model_learning_rate: Adam learning rate of the UNet parameter group.
            memory_learning_rate: Accepted for interface compatibility; not used by this method.
            text_embedding_optimization_steps: Step budget of the training loop (divided by `bsz`).
            model_fine_tuning_optimization_steps: Accepted for interface compatibility; not used by the
                training loop.
            **kwargs: Only the deprecated `torch_device` key is read.

        Raises:
            ValueError: If `height` or `width` is not divisible by 8.
        """
        # NOTE(review): neither the modules nor the optimizer are passed through
        # `accelerator.prepare` in this method -- confirm that the fp16 setting
        # has the intended effect.
        accelerator = Accelerator(
            gradient_accumulation_steps=1,
            mixed_precision="fp16",
        )

        if "torch_device" in kwargs:
            device = kwargs.pop("torch_device")
            warnings.warn(
                "`torch_device` is deprecated as an input argument to `__call__` and will be removed in v0.3.0."
                " Consider using `pipe.to(torch_device)` instead."
            )

            if device is None:
                device = "cuda" if torch.cuda.is_available() else "cpu"
            self.to(device)

        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        # The VAE stays frozen; every UNet and text-encoder parameter is trainable.
        self.vae.requires_grad_(False)
        self.unet.requires_grad_(True)
        self.text_encoder.requires_grad_(True)
        self.unet.train()
        self.vae.eval()
        self.text_encoder.train()

        if accelerator.is_main_process:
            accelerator.init_trackers(
                "imagic",
                config={
                    "embedding_learning_rate": embedding_learning_rate,
                    "text_embedding_optimization_steps": text_embedding_optimization_steps,
                },
            )

        # Tokenize the source caption once; the ids are re-encoded at every step
        # because the text encoder itself is being trained.
        text_input = self.tokenizer(
            source,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids = text_input.input_ids.to(self.device)

        # This forward pass is only needed for the dtype of the encoder output,
        # so no autograd graph is recorded for it.
        with torch.no_grad():
            latents_dtype = self.text_encoder(input_ids)[0].dtype

        unet_params = []
        for name, param in self.unet.named_parameters():
            unet_params.append(param)
            print(name, " with graidient")

        text_encoder_params = []
        for name, param in self.text_encoder.named_parameters():
            text_encoder_params.append(param)
            print(name, " with graidient")

        optimizer = torch.optim.Adam([
            {'params': unet_params, 'lr': diffusion_model_learning_rate},
            {'params': text_encoder_params, 'lr': embedding_learning_rate},
        ])

        # The progress bar is sized to the number of iterations the loop below performs.
        num_steps = text_embedding_optimization_steps // bsz
        progress_bar = tqdm(range(num_steps), disable=not accelerator.is_local_main_process)

        if isinstance(image, PIL.Image.Image):
            image = preprocess(image)

        image = image.to(device=self.device, dtype=latents_dtype)
        init_latent_image_dist = self.vae.encode(image).latent_dist
        image_latents = init_latent_image_dist.sample(generator=generator)
        # Same constant whose inverse is applied to the latents before `vae.decode` in `__call__`.
        image_latents = 0.18215 * image_latents
        # The latents never change during training, so the batch is built once.
        image_latents_batch = image_latents.repeat(bsz, 1, 1, 1)

        global_step = 0
        self.unet_list = []

        # NOTE: despite the wording of this message, the loop below updates the
        # weights of both the UNet and the text encoder.
        logger.info("First optimizing the text embedding to better reconstruct the init image")
        for _ in range(num_steps):
            # NOTE(review): the model argument presumably only matters when Accelerate
            # skips gradient synchronisation, which should not occur with
            # gradient_accumulation_steps=1 -- confirm against the installed version.
            with accelerator.accumulate(self.unet):
                # Sample the noise that is added to the latents.
                noise = torch.randn(image_latents_batch.shape).to(image_latents.device)
                text_embeddings = self.text_encoder(input_ids)[0]
                text_embeddings_batch = text_embeddings.repeat(bsz, 1, 1)

                timesteps = torch.randint(1000, (bsz,), device=image_latents.device)

                # Forward diffusion: noise the latents according to each sampled timestep.
                noisy_latents = self.scheduler.add_noise(image_latents_batch, noise, timesteps)

                # Predict the noise residual.
                noise_pred = self.unet(noisy_latents, timesteps, text_embeddings_batch).sample

                loss = F.mse_loss(noise_pred, noise, reduction="none").mean([1, 2, 3]).mean()
                accelerator.backward(loss)

                optimizer.step()
                optimizer.zero_grad()

            # Checks if the accelerator has performed an optimization step behind the scenes.
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1

            logs = {"loss": loss.detach().item()}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)

        accelerator.wait_for_everyone()

        # Freeze both trained modules again and leave them in eval mode.
        self.unet.requires_grad_(False)
        self.text_encoder.requires_grad_(False)
        self.unet.eval()
        self.text_encoder.eval()

        torch.cuda.empty_cache()
    
    @torch.no_grad()
    def __call__(
        self,
        source='',
        prompt='',
        unet_orig=None,
        alpha: float = 1.2,
        height: Optional[int] = 512,
        width: Optional[int] = 512,
        num_inference_steps: Optional[int] = 50,
        generator: Optional[torch.Generator] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        guidance_scale: float = 7.5,
        eta: float = 0.0,
        ForgetDegree=0.0,
        interpolation='vs',
        freeze='',
        textalpha=0,
        **kwargs,
    ):
        r"""
        Generate an image from a mix of the `source` and `prompt` text embeddings.

        A deep copy of `self.unet` is stored as `self.unet_copy` and used for denoising. Parameters of that copy
        selected by `freeze` are blended with the matching parameters of `unet_orig` before sampling.

        Args:
            source: Text encoded as the source embedding.
            prompt: Text encoded as the editing embedding.
            unet_orig: UNet whose parameters are blended into the copy. Only required when `freeze` selects at
                least one parameter; otherwise it may be `None`.
            alpha: Weight of the editing direction. With `interpolation='vs'` the conditioning is
                `alpha * prompt + (1 - alpha) * source`; with `'vp'` it scales the part of the prompt embedding
                left after removing its per-token projection onto the normalized source embedding.
            height: Output height in pixels; must be divisible by 8.
            width: Output width in pixels; must be divisible by 8.
            num_inference_steps: Number of scheduler timesteps.
            generator: Random generator used for the initial latents.
            output_type: `"pil"` converts the result to PIL images; any other value returns the numpy array.
            return_dict: When false, a `(images, has_nsfw_concept)` tuple is returned instead of a
                `StableDiffusionPipelineOutput`.
            guidance_scale: Classifier-free guidance weight; guidance is applied only when it is `> 1.0`.
            eta: Passed to `scheduler.step` only if that method accepts an `eta` argument.
            ForgetDegree: Blend weight for selected parameters:
                `ForgetDegree * fine_tuned + (1 - ForgetDegree) * original`. The default `0.0` replaces them with
                the original values.
            interpolation: `'vs'` or `'vp'` (see `alpha`). Any other value uses the source embedding unchanged.
                The `'vp'` branch reshapes with `view` in a way that requires a batch of exactly one text.
            freeze: Selects which parameters are blended; one of `'noencoder'`, `'nodecoder'`,
                `'encoderattn+encoder1'`, `'encoderkv'`, `'encoderattn'`, `'decoderattn'`. Any other value
                (including the default `''`) blends nothing.
            textalpha: Weight of the source embedding in the `'vp'` interpolation.
            **kwargs: Accepted and ignored.

        Returns:
            `StableDiffusionPipelineOutput`, or a tuple `(images, has_nsfw_concept)` when `return_dict` is false.

        Raises:
            ValueError: If `height`/`width` is not divisible by 8, or if `freeze` selects parameters while
                `unet_orig` is `None`.
        """
        # Validate cheap arguments before paying for the UNet copy.
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        self.unet.eval()

        if unet_orig is None and any(
            self._selected_by_freeze(freeze, name) for name, _ in self.unet.named_parameters()
        ):
            raise ValueError(f"`unet_orig` is required when `freeze` is {freeze!r}.")

        # Blend on a copy so that the fine-tuned weights in `self.unet` stay untouched.
        self.unet_copy = copy.deepcopy(self.unet)
        if unet_orig is None:
            self.unet_copy.requires_grad_(False)
        else:
            paired_params = zip(self.unet_copy.named_parameters(), unet_orig.named_parameters())
            for (name, param), (_, param_orig) in paired_params:
                param.requires_grad = False
                if self._selected_by_freeze(freeze, name):
                    param.data = ForgetDegree * param.data + (1 - ForgetDegree) * param_orig.data

        # Encode the source and the editing prompt.
        text_input = self.tokenizer(
            source,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_embeddings = self.text_encoder(text_input.input_ids.to(self.device))[0]

        prompt_input = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        prompt_embeddings = self.text_encoder(prompt_input.input_ids.to(self.device))[0]

        if interpolation == 'vs':
            text_embeddings = alpha * prompt_embeddings + (1 - alpha) * text_embeddings
        elif interpolation == 'vp':
            # Per token: remove from the prompt embedding its projection onto the
            # unit-length source embedding, then recombine with the source.
            unit_source = torch.nn.functional.normalize(text_embeddings, dim=2)
            _, n, c = unit_source.shape
            unit_source = unit_source.view(n, c, 1)
            prompt_rows = prompt_embeddings.view(n, 1, c)
            projection = torch.matmul(prompt_rows, unit_source) * unit_source
            projection = projection.view(1, n, c)
            edit_direction = prompt_embeddings - projection
            text_embeddings = alpha * edit_direction + textalpha * text_embeddings

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            uncond_input = self.tokenizer(
                [""],
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]

            seq_len = uncond_embeddings.shape[1]
            uncond_embeddings = uncond_embeddings.view(1, seq_len, -1)

            # Concatenate unconditional and conditional embeddings so that one UNet
            # forward pass per step serves both guidance branches.
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

        # Unlike in other pipelines, latents need to be generated in the target device
        # for 1-to-1 results reproducibility with the CompVis implementation.
        # However this currently doesn't work in `mps`.
        latents_shape = (1, self.unet.in_channels, height // 8, width // 8)
        latents_dtype = text_embeddings.dtype

        if self.device.type == "mps":
            # randn does not exist on mps
            latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to(
                self.device
            )
        else:
            latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)

        self.scheduler.set_timesteps(num_inference_steps)

        # Some schedulers like PNDM have timesteps as arrays
        # It's more optimized to move all timesteps to correct device beforehand
        timesteps_tensor = self.scheduler.timesteps.to(self.device)

        # scale the initial noise by the standard deviation required by the scheduler
        latents = latents * self.scheduler.init_noise_sigma

        # Not every scheduler's `step` accepts `eta`, so it is forwarded only when
        # the signature declares it.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        extra_step_kwargs = {}
        if "eta" in inspect.signature(self.scheduler.step).parameters:
            extra_step_kwargs["eta"] = eta

        for t in self.progress_bar(timesteps_tensor):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual with the blended UNet copy
            noise_pred = self.unet_copy(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

        # Inverse of the 0.18215 factor applied to the VAE latents in `train`.
        latents = 1 / 0.18215 * latents
        image = self.vae.decode(latents).sample

        image = (image / 2 + 0.5).clamp(0, 1)

        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()

        if self.safety_checker is not None:
            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(
                self.device
            )
            image, has_nsfw_concept = self.safety_checker(
                images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
            )
        else:
            has_nsfw_concept = None

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    @staticmethod
    def _selected_by_freeze(freeze, name):
        """Return True when `freeze` selects the parameter called `name` for blending with the original UNet."""
        in_down_blocks = 'down_blocks' in name
        in_up_blocks = 'up_blocks' in name
        is_attention = 'attn' in name

        if freeze == 'noencoder':
            return in_down_blocks
        if freeze == 'nodecoder':
            return in_up_blocks
        if freeze == 'encoderattn+encoder1':
            return in_down_blocks and not (is_attention or 'down_blocks.1' in name)
        if freeze == 'encoderkv':
            return in_down_blocks and ('attn2.to_v' in name or 'attn2.to_k' in name)
        if freeze == 'encoderattn':
            return in_down_blocks and not is_attention
        if freeze == 'decoderattn':
            return in_up_blocks and not is_attention
        # Unknown modes (including the default '') select nothing.
        return False