### Preamble ##########################################################################################################

"""
A GController module that implements adversarial classifier guidance for the StableDiffusion model.
"""

#######################################################################################################################

### Imports ###########################################################################################################

import torch
from torchvision.transforms.v2 import TenCrop, Grayscale
from torchvision.transforms.v2.functional import rotate
from torchvision.transforms import Compose, CenterCrop, Normalize, Resize
from typing import Union, Iterable, Optional, Tuple, Callable

from diffusers import DiffusionPipeline
from diffusers.models import AutoencoderKL
from diffusers.configuration_utils import register_to_config

from transformers import PreTrainedModel

from ..controller_utils import GController

from gcontrol.utils.im_utils import array_to_PIL

#######################################################################################################################


class AdversarialClassifierGuidance(GController):
    """
    Guidance controller that steers the diffusion noise estimate using the gradient of a classifier's log-probability
    for the requested target class(es), optionally mixed with classifier-free (adversarial) guidance.

    The classifier is evaluated on the decoded prediction of the original sample, so the scheduler in use must expose
    `pred_original_sample` in its step output.
    """

    def __init__(
        self,
        classifier: Union[PreTrainedModel, torch.nn.Module],
        do_resize: bool,
        do_rescale: bool,
        do_normalize: bool,
        size: Union[dict, tuple[int, int, int], tuple[int, int], list[int], int],
        crop_pct: Optional[float] = None,
        rescale_factor: Optional[float] = None,
        image_mean: Optional[Union[list[float], torch.Tensor]] = None,
        image_std: Optional[Union[list[float], torch.Tensor]] = None,
        **kwargs,
    ):
        """
        :param classifier: PreTrainedModel or torch.nn.Module
            The classification model used to guide the diffusion process. Should return either a `torch.Tensor` of
            logits, or an object with a `logits` attribute.
        :param do_resize:  bool
            Whether the decoded diffusion image should be resized prior to being passed to the classifier.
        :param do_rescale: bool
            Whether the decoded diffusion image should be rescaled prior to being passed to the classifier. Requires
            `rescale_factor`.
        :param do_normalize: bool
            Whether to normalise the image prior to classification. Requires `image_mean` and `image_std`.
        :param size: int, (int, int), (int, int, int), or dict
            An integer or tuple/list of integers with shape (C, H, W) or (H, W) that denotes the height and width that
            the diffusion image will be resized to when passing to the classifier. Note a single integer will cause
            images to be resized with equal height and width. A dict must contain either the key `shortest_edge`
            (square output) or both keys `height` and `width`.
        :param crop_pct: float
            Determines whether the image will be resized and then cropped to preserve aspect ratio. `crop_pct` is the
            percentage of the resized image that won't be cropped. If an image is to be resized to `size = (200, 200)`
            and `crop_pct = 0.8`, then the image will be resized to (250, 250) (i.e., 200 / 0.8) and then cropped to
            (200, 200).
        :param rescale_factor: float
            The scale factor to be applied to the image post resizing, but prior to normalisation.
        :param image_mean: list or torch.Tensor
            The image mean to be used in normalisation.
        :param image_std: list or torch.Tensor
            The image standard deviation to be used in normalisation.
        :param kwargs:
            Forwarded to the `GController` constructor after transformers-preprocessor-only keys are dropped.

        :raises ValueError: if `size` is a malformed dict/sequence, or if a preprocessing step is enabled without the
            values it needs.
        :raises TypeError: if `size` has an unsupported type.

        Returns a `gcontrol` adversarial classifier guidance module for use in the stable1 pipeline.
        """

        # For compatibility with transformers preprocess dict: drop keys this module does not consume
        for attr in ["_processor_class", "crop_pct", "resample", "image_processor_type"]:
            kwargs.pop(attr, None)

        super().__init__(**kwargs)

        size = self._canonical_size(size)

        # Fail early with a clear message rather than a `TypeError` deep inside transform construction
        if do_rescale and rescale_factor is None:
            raise ValueError("`rescale_factor` must be provided when `do_rescale` is True")
        if do_normalize and (image_mean is None or image_std is None):
            raise ValueError("`image_mean` and `image_std` must be provided when `do_normalize` is True")

        # Tensors are converted to plain lists before being stored in the config
        if isinstance(image_mean, torch.Tensor):
            image_mean = image_mean.tolist()

        if isinstance(image_std, torch.Tensor):
            image_std = image_std.tolist()

        self.register_module("classifier", classifier)

        self.register_to_config(
            crop_pct=crop_pct,
            do_resize=do_resize,
            size=size,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
        )

        # Initialising preprocessor
        transforms = []
        if self.config.crop_pct is not None and self.config.do_resize:  # Resize and crop preserving aspect ratio
            resize_size = int(round(min(self.config.size) / self.config.crop_pct))
            transforms.append(Resize(resize_size))
            transforms.append(CenterCrop(self.config.size))
        elif self.config.do_resize:  # Resize violating aspect ratio
            transforms.append(Resize(self.config.size))

        if self.config.do_rescale:  # Rescale, typically dividing by max pixel value (255)
            # Dividing by (1 / rescale_factor) is the same as multiplying by rescale_factor
            transforms.append(Normalize(0, 1 / self.config.rescale_factor))

        if self.config.do_normalize:  # Normalize with some mean and std
            transforms.append(Normalize(self.config.image_mean, self.config.image_std))

        # `None` means "no preprocessing"; `forward` passes the decoded image through unchanged in that case
        self.preprocessor = Compose(transforms=transforms) if transforms else None

        self._requires_uncond_noise = True
        self._requires_latent_grad = True

    @staticmethod
    def _canonical_size(size) -> tuple:
        """
        Converts any supported `size` specification into a `(height, width)` tuple.

        Lists are accepted alongside tuples (a tuple that has been serialised to JSON is read back as a list), and
        dicts may use either the `shortest_edge` or the `height`/`width` layout.
        """

        if isinstance(size, dict):
            if "shortest_edge" in size:
                return (size["shortest_edge"], size["shortest_edge"])
            if "height" in size and "width" in size:
                return (size["height"], size["width"])
            raise ValueError("`size` dict must contain key `shortest_edge` or keys `height` and `width`")
        if isinstance(size, int):
            return (size, size)
        if isinstance(size, (tuple, list)):
            if len(size) == 2:
                return tuple(size)
            if len(size) == 3:
                return tuple(size[1:])  # Drop the leading channel dimension
            raise ValueError(f"`size` expected tuple of length 2 or 3, got length {len(size)}")
        raise TypeError("Got unsupported `size` type")

    def forward(
        self,
        _pipeline: DiffusionPipeline,
        _unconditional_noise: torch.Tensor,
        _conditional_noise: torch.Tensor,
        _gconditional_noise: torch.Tensor,
        _latents: torch.Tensor,
        _t,
        _extra_step_kwargs,
        target_idx: Union[int, list[int]] = 0,
        g_w: float = 5,
        g_p: float = 2,
        g_m: Optional[float] = 0.5,
        g_s: float = 5,
        classifier_guidance_bounds: Tuple[int, int] = (0, 1000),
        grad_norm: Optional[int] = 2,
        augmentations: Optional[Union[list[Callable], Callable, str]] = None,
        grad_zero_threshold: Optional[float] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        :param target_idx: int or list of int
            Index (or indices) of the classifier class whose log-probability is maximised. Entry `i` of the list is
            used for batch element `i`. NOTE(review): a single integer becomes a one-element list, so only the first
            batch element receives a classifier gradient -- confirm this is intended for batch sizes above one.
        :param g_w: float
            Equivalent to the guidance scale used in the classifier-free guidance scheme. How much the diffusion
            follows classifier-free guidance.
        :param g_p: float
            Equivalent to the guidance scale used in the classifier-free guidance scheme. A larger value adds stronger
            adversarial features at the cost of image diversity.
        :param g_m: float | None
            The mixing scale for classifier-free adversarial guidance. A higher value incorporates more information
            from the classifier-free adversarial diffusion. If `None`, then only adversarial classifier guidance is
            used.
        :param g_s: float
            Equivalent to the guidance scale used in the classifier guidance scheme. How much the diffusion follows
            the adversarial classifier guidance.
        :param classifier_guidance_bounds: Tuple[int, int]
            Exclusive bounds of `_t` that adversarial classifier guidance will be performed in.
        :param grad_norm:  int | None
            The normalisation applied to the classifier gradient. A value of None is no normalisation, and other value
            is the l_{grad_norm} norm.
        :param augmentations: list of functions, function, or str
            Optional augmentation(s) to be applied to the images prior to classification. The classification
            logits are averaged across the predictions for each transformation and the unaugmented image. If `None`,
            then no transformations are applied. If an augmentation changes the shape of the image, then it will be
            resized using `torchvision.transforms.Resize`. Pass "recommended" to apply the recommended ten-crop and
            greyscale transforms. Pass `None` for no transforms. A list passed here is not modified.
        :param grad_zero_threshold: float | None
            Optional threshold to zero the gradient. This can help prevent underflow by zeroing gradients that are
            close to 0. Gradient thresholding is not applied if `None` is passed.

        :raises ValueError: if gradients are unavailable, if `g_m` is set without `_gconditional_noise`, or if
            `augmentations` is an unknown string.
        :raises TypeError: if the scheduler does not provide original sample predictions.

        Returns the guided noise estimate.
        """

        # Check that module has been passed latents with gradients
        if not torch.is_grad_enabled():
            raise ValueError("ClassifierGuidance requires torch.is_grad_enabled() to be true")
        if not _latents.requires_grad:
            raise ValueError("ClassifierGuidance expected _latents to have requires_grad = true")
        if _conditional_noise.grad_fn is None:
            raise ValueError("ClassifierGuidance expected _conditional_noise to have a non None grad_fn")

        if (_gconditional_noise is None) and (g_m is not None):
            raise ValueError(f"_gconditional_noise is `None`, but `g_m` is {g_m}, expected `g_m = {None}`")

        if isinstance(target_idx, int):
            target_idx = [target_idx]

        if isinstance(augmentations, str):
            if augmentations != "recommended":
                raise ValueError(f"Unknown augmentations preset `{augmentations}`, expected \"recommended\"")
            augmentations = [TenCrop(0.6 * self.config.size[0]), Grayscale(3)]

        if grad_zero_threshold is not None:
            # Zero small gradient entries as they flow back into the latents
            _latents.register_hook(lambda grad: torch.where(grad.abs() > grad_zero_threshold, grad, 0.0))

        generator = _extra_step_kwargs.get("generator")

        # Get the diffusion step standard deviation: noising a zero sample with unit noise leaves only the noise
        # coefficient at timestep `_t` (presumably sqrt(1 - alpha_bar_t) for standard diffusers schedulers)
        noise_sd = _pipeline.scheduler.add_noise(
            torch.tensor([0], dtype=_latents.dtype),
            torch.tensor([1], dtype=_latents.dtype),
            torch.tensor([_t], dtype=_t.dtype),
        )
        noise_sd = noise_sd.to(dtype=_latents.dtype, device=_latents.device)

        # Classifier-free guidance, optionally mixed with the adversarial classifier-free direction
        class_vec = -_unconditional_noise + _conditional_noise
        if g_m is None:
            noise = _unconditional_noise + g_w * class_vec
        else:
            adv_vec = -_unconditional_noise + _gconditional_noise
            noise = _unconditional_noise + g_w * class_vec + g_m * (-g_w * class_vec + g_p * adv_vec)

        if not ((classifier_guidance_bounds[0] < _t) and (_t < classifier_guidance_bounds[1])):
            return noise  # Outside the classifier guidance window

        # Get the estimated original image
        try:
            pred_latent = _pipeline.scheduler.step(
                noise, _t, _latents, **_extra_step_kwargs, return_dict=True
            ).pred_original_sample
            pred_latent = pred_latent.to(_latents.dtype)  # Recast for compatibility issues
        except AttributeError as err:
            raise TypeError(
                f"Diffusion pipeline scheduler: `{type(_pipeline.scheduler)}` does not provide original "
                "sample predictions. Try another scheduler such as "
                "`<diffusers.schedulers.scheduling_ddim.DDIMScheduler>`"
            ) from err
        if hasattr(_pipeline.scheduler, "_step_index"):
            _pipeline.scheduler._step_index -= 1  # Reset the step for use in the diffusion pipeline

        # Classifier prediction
        image = self._latents_to_image(_pipeline.vae, pred_latent, generator)
        processed_image = image if self.preprocessor is None else self.preprocessor(image)

        ## Try to handle latent and classifier dtype mismatch
        try:
            logits = self._do_classifier(processed_image, augmentations=augmentations)
        except RuntimeError:
            classifier_dtype = next(self.classifier.parameters()).dtype
            if classifier_dtype == processed_image.dtype:
                raise  # Not a dtype mismatch, retrying would only repeat the failure
            processed_image = processed_image.to(classifier_dtype)
            logits = self._do_classifier(processed_image, augmentations=augmentations)
        log_prob = torch.log_softmax(logits, dim=1)

        # Batches are independent, so individual gradients will be computed
        objective = torch.sum(log_prob[torch.arange(len(target_idx)), target_idx])

        # Compute the gradient accounting for CPU offloading
        unet_device = _pipeline.unet.device
        vae_device = _pipeline.vae.device
        if unet_device != vae_device:
            _pipeline.unet.to(vae_device)
            try:
                objective.backward()
            finally:
                _pipeline.unet.to(unet_device)  # Always restore the original placement
        else:
            objective.backward()

        # Compute the noise estimate
        if grad_norm is not None:
            grad = torch.nn.functional.normalize(_latents.grad, p=grad_norm, dim=(-3, -2, -1))
        else:
            grad = _latents.grad

        return noise - g_s * noise_sd * grad

    @staticmethod
    def do_gcontrol(g_w: float, g_m: Optional[float], g_s: float, **kwargs):
        """
        Returns whether this controller has any effect for the given scales, i.e. `True` unless `g_w == 1`,
        `g_m == 0` and `g_s == 0`. Note that `g_m = None` compares unequal to 0 and therefore yields `True`.
        """

        return (g_w != 1) or (g_m != 0) or (g_s != 0)

    def _latents_to_image(self, vae: AutoencoderKL, latents: torch.Tensor, generator: torch.Generator) -> torch.Tensor:
        """
        :param vae: AutoencoderKL
            The variational autoencoder used by the diffusion pipeline.
        :param latents: torch.Tensor
            (B, 4, h, w) The latents of the diffusion model.
        :param generator: torch.Generator
            Generator forwarded to `vae.decode`.

        Decodes the latents into a tensor of RGB values in the range [0, 255]. Note that this function does not
        handle upcasting of the VAE or latents. This is to support autograd for downstream tasks. The forward method
        should handle vae upcasting/downcasting.
        """

        decoded = vae.decode(latents / vae.config.scaling_factor, return_dict=False, generator=generator)[0]
        # Shift and scale by (x / 2 + 0.5), clamp to [0, 1], then scale to [0, 255]
        return (decoded / 2 + 0.5).clamp(min=0, max=1) * 255

    def _classifier_logits(self, image: torch.Tensor) -> torch.Tensor:
        """
        Runs the classifier on `image` and unwraps the `logits` attribute when the output is not already a tensor.
        """

        output = self.classifier(image)
        if (not isinstance(output, torch.Tensor)) and hasattr(output, "logits"):
            output = output.logits
        return output

    def _do_classifier(
        self, image: torch.Tensor, augmentations: Optional[Union[list[Callable], Callable]] = None
    ) -> torch.Tensor:
        """
        :param image: torch.Tensor
            (B, 3, h, w) A batch of images.
        :param augmentations: list of functions or function
            Optional augmentation(s) to be applied to the images prior to classification. The classification
            logits are averaged across the predictions for each transformation and the unaugmented image. If `None`,
            then no transformations are applied. If an augmentation changes the shape of the image, then it will be
            resized using `torchvision.transforms.Resize`. The passed list is not modified.

        :raises TypeError: if an augmentation returns something other than a tensor or a list/tuple of tensors.

        Returns a tensor of shape (B, S) containing the classification logits for each class, where `S` is the total
        number of classes. Note that logits are averaged across transformations if applied.
        """

        if augmentations is None:
            return self._classifier_logits(image)

        original_shape = image.shape[-2:]
        if not isinstance(augmentations, list):
            augmentations = [augmentations]
        # Build a new list so the caller's list is not extended on every call; the trailing identity function adds
        # the unaugmented image to the average
        all_transforms = [*augmentations, lambda x: x]

        augmented_images = []
        for aug in all_transforms:
            result = aug(image)
            if isinstance(result, torch.Tensor):
                augmented_images.append(result)
            elif isinstance(result, (list, tuple)):  # Augmentations that return several images (e.g. TenCrop)
                augmented_images.extend(result)
            else:
                raise TypeError("Unsupported augmentation return type.")

        logits = []
        for im in augmented_images:
            if not isinstance(im, torch.Tensor):
                raise TypeError("Unsupported augmentation return type.")
            if im.shape[-2:] != original_shape:
                im = Resize(original_shape)(im)  # Resizing augmented image if needed
            logits.append(self._classifier_logits(im))

        # Stack to (B, n_views, S) and average over transforms
        return torch.mean(torch.stack(logits, dim=1), dim=1)

    @property
    def requires_latent_grad(self):
        """
        Whether the guidance controller requires the latents to have gradients calculated through the UNet.
        """

        return self._requires_latent_grad


#######################################################################################################################
