"""
Based on https://github.com/openai/guided-diffusion/blob/main/guided_diffusion
and https://github.com/ehoogeboom/multinomial_diffusion
"""

import torch.nn.functional as F
import torch
import math
from tqdm.notebook import tqdm

import numpy as np
from tab_geodiff.models.utils import *

"""
Based in part on: https://github.com/lucidrains/denoising-diffusion-pytorch/blob/5989f4c77eafcdc6be0fb4739f0f277a6dd7f7d8/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L281
"""
eps = 1e-8

def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.

    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.

    :param schedule_name: one of "linear", "cosine", "snr", "quadratic",
                          "sigmoid", "logarithmic", "cosine_param" or
                          "snr_decay".
    :param num_diffusion_timesteps: the number of betas to produce.
    :return: a 1-D numpy array holding one beta per diffusion step.
    :raises NotImplementedError: if schedule_name is not recognised.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return np.linspace(
            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
        )
    elif schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )

    elif schedule_name == "snr":
        # Sigmoid-shaped SNR profile; see snr_based_beta_schedule.
        return snr_based_beta_schedule(
            num_diffusion_timesteps
        )

    elif schedule_name == "quadratic":
        # Linear in sqrt(beta), i.e. a quadratic increase from small beta to
        # larger beta.
        beta_start = 0.0001
        beta_end = 0.02
        return np.linspace(
            beta_start ** 0.5, beta_end ** 0.5, num_diffusion_timesteps, dtype=np.float64
        ) ** 2

    elif schedule_name == "sigmoid":
        # Sigmoid-shaped schedule rescaled into [0.0001, 0.02].
        grid = np.linspace(-6, 6, num_diffusion_timesteps)
        return 1 / (1 + np.exp(-grid)) * (0.02 - 0.0001) + 0.0001

    elif schedule_name == "logarithmic":
        # Local offset inside the logarithms; named so it does not shadow the
        # module-level `eps`.
        log_eps = 1e-5
        x = np.linspace(1, num_diffusion_timesteps, num_diffusion_timesteps)
        log_scaled = np.log(x + log_eps) / np.log(num_diffusion_timesteps + log_eps)
        return log_scaled * (0.02 - 0.0001) + 0.0001

    elif schedule_name == "cosine_param":
        # Cosine alpha-bar raised to the power 2 * gamma, evaluated on
        # steps + 1 grid points so that the ratios give exactly `steps` betas.
        s = 0.008
        gamma = 1.2
        steps = num_diffusion_timesteps
        t = np.linspace(0, steps, steps + 1) / steps
        alpha_bar = (np.cos((t + s) / (1 + s) * np.pi / 2)) ** (2 * gamma)
        betas = np.clip(1 - (alpha_bar[1:] / alpha_bar[:-1]), 0.0001, 0.999)
        return betas

    elif schedule_name == "snr_decay":
        # Desired SNR profile: start high, decay log-linearly.
        min_snr = 0.1
        max_snr = 100
        snr = np.logspace(np.log10(max_snr), np.log10(min_snr), num_diffusion_timesteps)

        # Recover the cumulative alpha from SNR_t = a_bar_t / (1 - a_bar_t).
        alpha_bar = snr / (1 + snr)

        # beta_t = 1 - a_bar_t / a_bar_{t-1} with a_bar_{-1} = 1.  Prepending
        # the 1 keeps one beta per timestep; taking ratios of the bare array
        # would return one value too few.
        alpha_bar_prev = np.concatenate(([1.0], alpha_bar[:-1]))
        betas = np.clip(1 - (alpha_bar / alpha_bar_prev), 0.0001, 0.999)
        return betas

    else:
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


def snr_based_beta_schedule(num_diffusion_timesteps, min_snr=0.01, max_snr=20):
    """
    Build a beta schedule from a sigmoid-shaped signal-to-noise profile.

    :param num_diffusion_timesteps: the number of betas to produce.
    :param min_snr: additive floor of the SNR curve (controls final noise level).
    :param max_snr: scale of the SNR curve (controls initial noise).
    :return: numpy array of betas; the ratio-derived entries are clipped to
             [0, 0.999] and a terminal 0.999 is appended.
    """
    grid = np.linspace(0, 1, num_diffusion_timesteps)

    # Falling sigmoid centred on the middle of the grid, scaled by max_snr and
    # lifted by min_snr.
    rising = 1/(1 + np.exp(-10*(grid-0.5)))
    snr_curve = max_snr * (1 - rising) + min_snr

    # SNR -> cumulative alpha; successive ratios give the per-step alphas.
    cumulative_alpha = snr_curve / (1 + snr_curve)
    step_alpha = cumulative_alpha[1:] / cumulative_alpha[:-1]
    clipped = np.clip(1 - step_alpha, 0, 0.999)

    # The ratios give one value fewer than requested, so the schedule is
    # closed with a last beta close to 1.
    return np.concatenate([clipped, [0.999]])

def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Discretize a continuous alpha-bar function into a beta schedule.

    For every step i the beta is 1 - alpha_bar((i + 1) / N) / alpha_bar(i / N),
    capped at max_beta.

    :param num_diffusion_timesteps: the number of betas to produce (N).
    :param alpha_bar: a callable taking t in [0, 1] and returning the
                      cumulative product of (1 - beta) up to that part of the
                      diffusion process.
    :param max_beta: upper cap on each beta; values below 1 prevent
                     singularities.
    :return: numpy array with num_diffusion_timesteps betas.
    """
    n = num_diffusion_timesteps
    return np.array([
        min(1 - alpha_bar((k + 1) / n) / alpha_bar(k / n), max_beta)
        for k in range(n)
    ])

# def cat2psk(tensor, categories):
#     device = tensor.device
#     tensor = tensor.int()
#     cat_spacings = 2 * math.pi / torch.tensor(categories).to(device)
    
#     phases = tensor * cat_spacings
#     psk_enc = torch.cat((torch.sin(phases), torch.cos(phases)), dim=-1)
#     return psk_enc

# def psk2cat(psk_tensor, categories):
#     device = psk_tensor.device
#     cat_len = categories.shape[0]
#     cat_spacings = 2 * math.pi / torch.tensor(categories).to(device)
    
#     sin_phases = psk_tensor[:, :cat_len]
#     cos_phases = psk_tensor[:, cat_len:2*cat_len]
    
#     phases = torch.atan2(sin_phases, cos_phases)
#     phases = (phases + 2 * math.pi) % (2 * math.pi)
    
#     feature_cat = torch.round(phases / cat_spacings).long()

#     total = feature_cat.numel()
#     corrected = 0

#     for i, cat in enumerate(categories):
#         corrected += (feature_cat[:, i] >= cat).sum().item()
#         feature_cat[:, i] = torch.where(feature_cat[:, i] >= cat, 0, feature_cat[:, i])

#     if total > 0:
#         casting_rate = corrected / total
#         print(f"Casting rate in psk2cat: {round(casting_rate, 3)}")
    
#     return feature_cat





def generalized_spiral(K, device='cpu'):
    """
    Place K points on the unit sphere along a golden-angle spiral.

    The z-coordinate is spaced evenly from -1 to 1 and the azimuth advances by
    the golden angle from one point to the next.

    Args:
        K: Number of points
        device: Target device for the tensor

    Returns:
        Tensor of shape (K, 3) with Cartesian coordinates on the unit sphere
    """
    indices = torch.arange(K, dtype=torch.float32, device=device)
    # max(K - 1, 1) guards K == 1, where dividing by K - 1 == 0 would turn the
    # single height into 0 / 0 = NaN.  For K >= 2 the value is unchanged.
    h = -1 + 2 * (indices) / max(K - 1, 1)
    theta = torch.acos(h)

    # Golden angle increment; index 0 stays at azimuth 0.
    golden_angle = math.pi * (3 - math.sqrt(5))
    phi = (golden_angle * indices) % (2*math.pi)

    return torch.stack([
        torch.sin(theta) * torch.cos(phi),
        torch.sin(theta) * torch.sin(phi),
        torch.cos(theta)
    ], dim=1)


def fibonacci_sphere(K, device='cpu'):
    """
    Generate K nearly-uniform points on a sphere using Fibonacci sphere method.

    Args:
        K: Number of points (categories)
        device: Target device for the tensor

    Returns:
        Tensor of shape (K, 3) with Cartesian coordinates on the unit sphere.
        For K == 1 the single point is (0, 1, 0).
    """
    indices = torch.arange(K, dtype=torch.float32, device=device)
    phi = math.pi * (3. - math.sqrt(5.))  # Golden angle

    # max(K - 1, 1) guards K == 1 (a single-category feature), where dividing
    # by K - 1 == 0 would give 0 / 0 = NaN.  For K >= 2 nothing changes.
    y = 1 - (indices / max(K - 1, 1)) * 2  # y goes from 1 to -1
    radius = torch.sqrt(1 - y ** 2)  # Radius at height y

    theta = phi * indices

    x = torch.cos(theta) * radius
    z = torch.sin(theta) * radius

    return torch.stack([x, y, z], dim=1)  # shape (K, 3)




def cat2psk(tensor, categories, precomputed_refs=None):
    """
    Encode category indices as 3-D reference points, one triple per feature.

    Negative indices are treated as missing and encoded as the zero vector,
    which is what psk2cat decodes back to its `missing_val`.  (Plain tensor
    indexing would instead wrap a negative index around to the last category.)

    Args:
        tensor: (B, num_features) int tensor of category indices
        categories: List[int] of category counts
        precomputed_refs: Optional cached reference points, one (K, 3) tensor
            per feature

    Returns:
        encoded: (B, 3*num_features) tensor
        ref_points: The reference points that were used, for reuse in decoding
    """
    device = tensor.device
    batch_size, num_features = tensor.shape
    encoded = torch.zeros((batch_size, 3*num_features), device=device)

    # Generate or reuse reference points
    if precomputed_refs is None:
        precomputed_refs = [fibonacci_sphere(K, device) for K in categories]

    for i in range(num_features):
        start, end = 3*i, 3*(i+1)
        idx = tensor[:, i].long()
        present = idx >= 0
        # Look up a valid row for every sample (index 0 as placeholder for
        # missing ones), then blank the placeholder rows.
        safe_idx = torch.where(present, idx, torch.zeros_like(idx))
        points = precomputed_refs[i][safe_idx]
        points = torch.where(present.unsqueeze(1), points, torch.zeros_like(points))
        encoded[:, start:end] = points

    return encoded, precomputed_refs

def psk2cat(psk_tensor, categories, precomputed_refs=None, missing_val=-1, eps=1e-6):
    """
    Decode 3-D point encodings back to category indices by nearest reference.

    Args:
        psk_tensor: (B, 3*num_features) encoded tensor
        categories: List[int] of category counts
        precomputed_refs: Cached reference points from encoding
        missing_val: Value written where the encoded triple is treated as
            missing
        eps: A triple whose Euclidean norm is below this threshold is treated
            as missing

    Returns:
        decoded: (B, num_features) long tensor of category indices
    """
    device = psk_tensor.device
    batch_size = psk_tensor.shape[0]
    decoded = torch.full((batch_size, len(categories)), missing_val,
                        dtype=torch.long, device=device)

    # An empty batch has nothing to decode; it would also make the -1 in the
    # reshape below ambiguous and raise.
    if batch_size == 0:
        return decoded

    # Group the flat encoding into one triple per feature.
    psk_3d = psk_tensor.reshape(batch_size, -1, 3)  # (B, num_features, 3)
    norms = torch.norm(psk_3d, dim=2)
    valid_mask = norms >= eps

    # Generate references if not provided
    if precomputed_refs is None:
        precomputed_refs = [fibonacci_sphere(K, device) for K in categories]

    # zip() stops at the shorter of the two sequences.
    for i, (_, refs) in enumerate(zip(categories, precomputed_refs)):
        mask = valid_mask[:, i]
        if not mask.any():
            continue

        # Nearest reference point (Euclidean distance) for the valid rows.
        points = psk_3d[mask, i]
        dists = torch.cdist(points, refs.to(device=device, dtype=points.dtype))
        decoded[mask, i] = torch.argmin(dists, dim=1)

    return decoded







def compute_smoothness(x):
    """Return, as a Python float, the Frobenius norm of the first differences of x along dim 1."""
    left, right = x[:, :-1], x[:, 1:]
    column_steps = right - left
    return column_steps.norm(p='fro').item()

def adaptive_mollification(x_cat, initial_epsilon=0.01, smoothness_factor=0.5, max_iters=10):
    """
    Add Gaussian noise of growing scale until a smoothness target is met.

    The target is smoothness_factor times the value compute_smoothness returns
    for the input.  Each iteration draws fresh noise on top of the original
    x_cat (noise is not accumulated) and multiplies epsilon by 1.5 afterwards.
    Progress is printed to stdout.

    NOTE(review): compute_smoothness is a norm of finite differences, and
    adding i.i.d. noise will usually raise it, so with smoothness_factor < 1
    the criterion may rarely be met -- confirm the intended direction.

    :param x_cat: tensor to perturb.
    :param initial_epsilon: noise scale of the first iteration.
    :param smoothness_factor: fraction of the initial smoothness used as target.
    :param max_iters: maximum number of noise draws.
    :return: the first perturbed tensor meeting the target, otherwise the last
             one drawn; x_cat itself when max_iters <= 0.
    """
    # Step 1: Measure initial smoothness
    initial_smoothness = compute_smoothness(x_cat)
    target_smoothness = smoothness_factor * initial_smoothness
    print(f"Initial smoothness: {initial_smoothness:.4f}, Target smoothness: {target_smoothness:.4f}")
    epsilon = initial_epsilon
    # Fallback so that max_iters <= 0 returns the input instead of failing on
    # an unbound local after the loop.
    x_cat_smooth = x_cat
    for i in range(max_iters):
        # Apply mollification
        x_cat_smooth = x_cat + epsilon * torch.randn_like(x_cat)

        # Compute current smoothness
        smoothness = compute_smoothness(x_cat_smooth)
        print(f"Iteration {i+1}: epsilon={epsilon:.4f}, smoothness={smoothness:.4f}")

        # Check if smoothness condition is satisfied
        if smoothness < target_smoothness:
            print("Smoothness criterion satisfied.")
            return x_cat_smooth

        # Increase epsilon progressively
        epsilon *= 1.5

    print("Max iterations reached. Returning last mollified result.")
    return x_cat_smooth

def gaussian_smooth(x, sigma=1.0, kernel_size=5):
    """
    Smooth each row of x with a normalized 1-D Gaussian kernel.

    The input is convolved along its last dimension with zero padding of
    kernel_size // 2 on both sides.  With an odd kernel_size the output has
    the same shape as x; an even kernel_size yields one extra column.

    :param x: floating-point tensor; the code assumes a (batch, features)
              layout.
    :param sigma: standard deviation of the Gaussian, in samples.
    :param kernel_size: number of kernel taps.
    :return: the smoothed tensor, in the dtype and on the device of x.
    """
    half = kernel_size // 2

    # Create a 1D Gaussian kernel centred on the middle tap.
    coords = torch.arange(kernel_size) - half
    g = torch.exp(-(coords**2) / (2*sigma**2))
    g = g / g.sum()  # Normalize kernel

    # conv1d needs input and weight to share dtype and device.  Follow the
    # dtype of x when it is floating point, so float64/float16 inputs work;
    # otherwise leave the kernel as built (casting it to an integer dtype
    # would silently zero it out).
    kernel_dtype = x.dtype if x.is_floating_point() else g.dtype
    g = g.to(device=x.device, dtype=kernel_dtype)

    # Reshape to (1, 1, kernel_size): one output channel, one input channel.
    g = g.unsqueeze(0).unsqueeze(0)

    # Add a dummy channel dimension, convolve, then drop it again.
    x_smooth = F.conv1d(x.unsqueeze(1), g, padding=half)
    return x_smooth.squeeze(1)

class GaussianMultinomialDiffusion(torch.nn.Module):
    def __init__(
            self,
            num_classes: np.array,
            num_numerical_features: int,
            denoise_fn,
            num_timesteps=2000,
            gaussian_loss_type='mse',
            gaussian_parametrization='eps',
            multinomial_loss_type='vb_stochastic',
            parametrization='x0',
            scheduler='cosine',
            device=torch.device('cpu'),
            num_bits_per_cat_feature=0
        ):
        """
        Store the configuration and precompute schedule-derived tables.

        :param num_classes: numpy array [K1, K2, ..., Km] with the number of
            categories of each categorical feature; may be empty (or sum to
            zero) when there are no categorical features.
        :param num_numerical_features: stored on the instance as-is.
        :param denoise_fn: stored as `self._denoise_fn`; not called here.
        :param num_timesteps: number of diffusion steps requested from the
            beta schedule.
        :param gaussian_loss_type: stored on the instance as-is.
        :param gaussian_parametrization: stored on the instance as-is.
        :param multinomial_loss_type: 'vb_stochastic' or 'vb_all' (asserted).
        :param parametrization: 'x0' or 'direct' (asserted).
        :param scheduler: schedule name passed to get_named_beta_schedule.
        :param device: device the precomputed tables are moved to.
        :param num_bits_per_cat_feature: stored on the instance as-is.
        """

        super(GaussianMultinomialDiffusion, self).__init__()
        assert multinomial_loss_type in ('vb_stochastic', 'vb_all')
        assert parametrization in ('x0', 'direct')

        if multinomial_loss_type == 'vb_all':
            print('Computing the loss using the bound on _all_ timesteps.'
                  ' This is expensive both in terms of memory and computation.')

        self.num_numerical_features = num_numerical_features
        self.num_classes = num_classes  # it as a vector [K1, K2, ..., Km]
        
        # Handle case when there are no categorical features
        if len(num_classes) == 0 or num_classes.sum() == 0:
            self.num_classes = np.array([])
            self.num_classes_expanded = torch.empty(0).to(device)
            self.slices_for_classes = []
            self.offsets = torch.tensor([0]).to(device)
        else:
            # Each K_i repeated K_i times, i.e. one entry per one-hot column
            # holding the size of the feature that column belongs to.
            self.num_classes_expanded = torch.from_numpy(
                np.concatenate([num_classes[i].repeat(num_classes[i]) for i in range(len(num_classes))])
            ).to(device)

            # Column index ranges of each categorical feature inside the
            # concatenated one-hot layout, plus the cumulative offsets
            # [0, K1, K1 + K2, ...].
            self.slices_for_classes = [np.arange(self.num_classes[0])]
            offsets = np.cumsum(self.num_classes)
            for i in range(1, len(offsets)):
                self.slices_for_classes.append(np.arange(offsets[i - 1], offsets[i]))
            self.offsets = torch.from_numpy(np.append([0], offsets)).to(device)

        self._denoise_fn = denoise_fn
        self.gaussian_loss_type = gaussian_loss_type
        self.gaussian_parametrization = gaussian_parametrization
        self.multinomial_loss_type = multinomial_loss_type
        self.num_timesteps = num_timesteps
        self.parametrization = parametrization
        self.scheduler = scheduler
        self.num_bits_per_cat_feature = num_bits_per_cat_feature

        # Noise schedule: per-step alphas and betas from the named schedule,
        # held as float64 torch tensors while the tables are derived.
        alphas = 1. - get_named_beta_schedule(scheduler, num_timesteps)
        alphas = torch.tensor(alphas.astype('float64')) # alpha2_t
        betas = 1. - alphas    # beta2_t
        
        # Log-space tables.  log_1_min_a comes from the project utils;
        # presumably it computes log(1 - exp(a)) -- confirm there.

        log_alpha = np.log(alphas)
        log_cumprod_alpha = np.cumsum(log_alpha)

        log_1_min_alpha = log_1_min_a(log_alpha)
        log_1_min_cumprod_alpha = log_1_min_a(log_cumprod_alpha)

        alphas_cumprod = np.cumprod(alphas, axis=0) # tilde_alpha2_t
        alphas_cumprod_prev = torch.tensor(np.append(1.0, alphas_cumprod[:-1]))  # tilde_alpha2_{t-1}
        alphas_cumprod_next = torch.tensor(np.append(alphas_cumprod[1:], 0.0)) # tilde_alpha2_{t+1}
        sqrt_alphas_cumprod = np.sqrt(alphas_cumprod)    # tilde_alpha_t
        sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - alphas_cumprod) # tilde_beta_t
        sqrt_recip_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod)    # sqrt(1 / tilde_alpha_t)
        sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod - 1) # sqrt(tilde_beta_t / tilde_alpha_t )

        # Gaussian diffusion

        # NOTE: posterior_variance is kept as a plain float64 attribute; unlike
        # the tables below it is neither cast to float32 nor moved to `device`.
        self.posterior_variance = (
            betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
        )
        # Entry 0 of posterior_variance is 0 (alphas_cumprod_prev[0] == 1), so
        # it is replaced by entry 1 before taking the log.
        self.posterior_log_variance_clipped = torch.from_numpy(
            np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:]))
        ).float().to(device)
        # Weights of x_start (coef1) and x_t (coef2) used by
        # gaussian_q_posterior_mean_variance.
        self.posterior_mean_coef1 = (
            betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
        ).float().to(device)
        self.posterior_mean_coef2 = (
            (1.0 - alphas_cumprod_prev)
            * np.sqrt(alphas.numpy())
            / (1.0 - alphas_cumprod)
        ).float().to(device)

        # Sanity checks on the log-space tables.  log_add_exp comes from the
        # project utils; presumably log(exp(a) + exp(b)), so that each pair
        # should sum to ~1 in probability space -- confirm there.
        assert log_add_exp(log_alpha, log_1_min_alpha).abs().sum().item() < 1.e-5
        assert log_add_exp(log_cumprod_alpha, log_1_min_cumprod_alpha).abs().sum().item() < 1e-5
        assert (np.cumsum(log_alpha) - log_cumprod_alpha).abs().sum().item() < 1.e-5

        # Convert to float32 and register buffers.
        self.register_buffer('alphas', alphas.float().to(device))
        self.register_buffer('log_alpha', log_alpha.float().to(device))
        self.register_buffer('log_1_min_alpha', log_1_min_alpha.float().to(device))
        self.register_buffer('log_1_min_cumprod_alpha', log_1_min_cumprod_alpha.float().to(device))
        self.register_buffer('log_cumprod_alpha', log_cumprod_alpha.float().to(device))
        self.register_buffer('alphas_cumprod', alphas_cumprod.float().to(device))
        self.register_buffer('alphas_cumprod_prev', alphas_cumprod_prev.float().to(device))
        self.register_buffer('alphas_cumprod_next', alphas_cumprod_next.float().to(device))
        self.register_buffer('sqrt_alphas_cumprod', sqrt_alphas_cumprod.float().to(device))
        self.register_buffer('sqrt_one_minus_alphas_cumprod', sqrt_one_minus_alphas_cumprod.float().to(device))
        self.register_buffer('sqrt_recip_alphas_cumprod', sqrt_recip_alphas_cumprod.float().to(device))
        self.register_buffer('sqrt_recipm1_alphas_cumprod', sqrt_recipm1_alphas_cumprod.float().to(device))

        # Per-timestep accumulators, initialised to zero (not placed on
        # `device` here).
        self.register_buffer('Lt_history', torch.zeros(num_timesteps))
        self.register_buffer('Lt_count', torch.zeros(num_timesteps))

        # Starts empty; nothing in this method fills it.
        self.precomputed_refs = None

# class GaussianMultinomialDiffusion(torch.nn.Module):
#     def __init__(
#             self,
#             num_classes: np.array,
#             num_numerical_features: int,
#             denoise_fn,
#             num_timesteps=2000,
#             gaussian_loss_type='mse',
#             gaussian_parametrization='eps',
#             multinomial_loss_type='vb_stochastic',
#             parametrization='x0',
#             scheduler='cosine',
#             device=torch.device('cpu'),
#             num_bits_per_cat_feature=0
#         ):

#         super(GaussianMultinomialDiffusion, self).__init__()
#         assert multinomial_loss_type in ('vb_stochastic', 'vb_all')
#         assert parametrization in ('x0', 'direct')

#         if multinomial_loss_type == 'vb_all':
#             print('Computing the loss using the bound on _all_ timesteps.'
#                   ' This is expensive both in terms of memory and computation.')

#         self.num_numerical_features = num_numerical_features
#         self.num_classes = num_classes # it as a vector [K1, K2, ..., Km]
#         # self.num_classes = 0
#         print('self.num_classes',self.num_classes)

#         self.num_classes_expanded = torch.from_numpy(
            

#             np.concatenate([num_classes[i].repeat(num_classes[i]) for i in range(len(num_classes))])
#         ).to(device)

#         self.slices_for_classes = [np.arange(self.num_classes[0])]
#         offsets = np.cumsum(self.num_classes)
#         for i in range(1, len(offsets)):
#             self.slices_for_classes.append(np.arange(offsets[i - 1], offsets[i]))
#         self.offsets = torch.from_numpy(np.append([0], offsets)).to(device)

#         self._denoise_fn = denoise_fn
#         self.gaussian_loss_type = gaussian_loss_type
#         self.gaussian_parametrization = gaussian_parametrization
#         self.multinomial_loss_type = multinomial_loss_type
#         self.num_timesteps = num_timesteps
#         self.parametrization = parametrization
#         self.scheduler = scheduler
#         self.num_bits_per_cat_feature=num_bits_per_cat_feature

#         alphas = 1. - get_named_beta_schedule(scheduler, num_timesteps)
#         alphas = torch.tensor(alphas.astype('float64')) # alpha2_t
#         betas = 1. - alphas    # beta2_t

#         log_alpha = np.log(alphas)
#         log_cumprod_alpha = np.cumsum(log_alpha)

#         log_1_min_alpha = log_1_min_a(log_alpha)
#         log_1_min_cumprod_alpha = log_1_min_a(log_cumprod_alpha)

#         alphas_cumprod = np.cumprod(alphas, axis=0) # tilde_alpha2_t
#         alphas_cumprod_prev = torch.tensor(np.append(1.0, alphas_cumprod[:-1]))  # tilde_alpha2_{t-1}
#         alphas_cumprod_next = torch.tensor(np.append(alphas_cumprod[1:], 0.0)) # tilde_alpha2_{t+1}
#         sqrt_alphas_cumprod = np.sqrt(alphas_cumprod)    # tilde_alpha_t
#         sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - alphas_cumprod) # tilde_beta_t
#         sqrt_recip_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod)    # sqrt(1 / tilde_alpha_t)
#         sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / alphas_cumprod - 1) # sqrt(tilde_beta_t / tilde_alpha_t )

#         # Gaussian diffusion

#         self.posterior_variance = (
#             betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
#         )
#         self.posterior_log_variance_clipped = torch.from_numpy(
#             np.log(np.append(self.posterior_variance[1], self.posterior_variance[1:]))
#         ).float().to(device)
#         self.posterior_mean_coef1 = (
#             betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)
#         ).float().to(device)
#         self.posterior_mean_coef2 = (
#             (1.0 - alphas_cumprod_prev)
#             * np.sqrt(alphas.numpy())
#             / (1.0 - alphas_cumprod)
#         ).float().to(device)

#         assert log_add_exp(log_alpha, log_1_min_alpha).abs().sum().item() < 1.e-5
#         assert log_add_exp(log_cumprod_alpha, log_1_min_cumprod_alpha).abs().sum().item() < 1e-5
#         assert (np.cumsum(log_alpha) - log_cumprod_alpha).abs().sum().item() < 1.e-5

#         # Convert to float32 and register buffers.
#         self.register_buffer('alphas', alphas.float().to(device))
#         self.register_buffer('log_alpha', log_alpha.float().to(device))
#         self.register_buffer('log_1_min_alpha', log_1_min_alpha.float().to(device))
#         self.register_buffer('log_1_min_cumprod_alpha', log_1_min_cumprod_alpha.float().to(device))
#         self.register_buffer('log_cumprod_alpha', log_cumprod_alpha.float().to(device))
#         self.register_buffer('alphas_cumprod', alphas_cumprod.float().to(device))
#         self.register_buffer('alphas_cumprod_prev', alphas_cumprod_prev.float().to(device))
#         self.register_buffer('alphas_cumprod_next', alphas_cumprod_next.float().to(device))
#         self.register_buffer('sqrt_alphas_cumprod', sqrt_alphas_cumprod.float().to(device))
#         self.register_buffer('sqrt_one_minus_alphas_cumprod', sqrt_one_minus_alphas_cumprod.float().to(device))
#         self.register_buffer('sqrt_recip_alphas_cumprod', sqrt_recip_alphas_cumprod.float().to(device))
#         self.register_buffer('sqrt_recipm1_alphas_cumprod', sqrt_recipm1_alphas_cumprod.float().to(device))

#         self.register_buffer('Lt_history', torch.zeros(num_timesteps))
#         self.register_buffer('Lt_count', torch.zeros(num_timesteps))

#         self.precomputed_refs = None
    
    # Gaussian part
    def gaussian_q_mean_variance(self, x_start, t):
        """
        Return mean, variance and log-variance of the noised x_start at steps t.

        The mean is x_start scaled by sqrt_alphas_cumprod[t]; the variance is
        1 - alphas_cumprod[t]; the log-variance is read from
        log_1_min_cumprod_alpha[t].  All lookups go through `extract`.
        """
        shape = x_start.shape
        signal_scale = extract(self.sqrt_alphas_cumprod, t, shape)
        variance = extract(1.0 - self.alphas_cumprod, t, shape)
        log_variance = extract(self.log_1_min_cumprod_alpha, t, shape)
        return signal_scale * x_start, variance, log_variance
    
    def gaussian_q_sample(self, x_start, t, noise=None):
        """
        Noise x_start to step t.

        Returns sqrt_alphas_cumprod[t] * x_start
        + sqrt_one_minus_alphas_cumprod[t] * noise, drawing standard normal
        noise when none is supplied.
        """
        noise = torch.randn_like(x_start) if noise is None else noise
        assert noise.shape == x_start.shape
        signal = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        perturbation = (
            extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
        )
        return signal + perturbation
    
    def gaussian_q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Return mean, variance and clipped log-variance of the Gaussian posterior.

        The mean is posterior_mean_coef1[t] * x_start
        + posterior_mean_coef2[t] * x_t; the variance and log-variance are
        looked up from the tables built in __init__.
        """
        assert x_start.shape == x_t.shape
        shape = x_t.shape

        start_term = extract(self.posterior_mean_coef1, t, shape) * x_start
        current_term = extract(self.posterior_mean_coef2, t, shape) * x_t
        posterior_mean = start_term + current_term

        posterior_variance = extract(self.posterior_variance, t, shape)
        posterior_log_variance_clipped = extract(
            self.posterior_log_variance_clipped, t, shape
        )

        # All three results must share the batch dimension of the input.
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def gaussian_p_mean_variance(
        self, model_output, x, t, clip_denoised=False, denoised_fn=None, model_kwargs=None
    ):
        """
        Turn a network output into the parameters of the reverse Gaussian step.

        The variance table is the per-step beta (1 - alphas) with entry 0
        replaced by posterior_variance[1].  The predicted x_0 is either
        model_output itself ('x0' parametrization) or derived from it as a
        noise prediction ('eps').  clip_denoised, denoised_fn and model_kwargs
        are accepted but not used here.

        :return: dict with keys "mean", "variance", "log_variance" and
                 "pred_xstart".
        :raises NotImplementedError: for any other gaussian_parametrization.
        """
        B, _ = x.shape[:2]
        assert t.shape == (B,)

        first_entry = self.posterior_variance[1].unsqueeze(0).to(x.device)
        remaining = (1. - self.alphas)[1:]
        variance_table = torch.cat([first_entry, remaining], dim=0)
        log_variance_table = torch.log(variance_table)

        model_variance = extract(variance_table, t, x.shape)
        model_log_variance = extract(log_variance_table, t, x.shape)

        parametrization = self.gaussian_parametrization
        if parametrization == 'eps':
            pred_xstart = self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
        elif parametrization == 'x0':
            pred_xstart = model_output
        else:
            raise NotImplementedError

        # Only the mean of the posterior is needed; it is evaluated at the
        # predicted x_0.
        model_mean = self.gaussian_q_posterior_mean_variance(
            x_start=pred_xstart, x_t=x, t=t
        )[0]

        assert (
            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        ), f'{model_mean.shape}, {model_log_variance.shape}, {pred_xstart.shape}, {x.shape}'

        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
        }
    
    def _vb_terms_bpd(
        self, model_output, x_start, x_t, t, clip_denoised=False, model_kwargs=None
    ):
        """
        Compute one variational-bound term per batch element, divided by log(2).

        For t == 0 the term is the negative discretized Gaussian
        log-likelihood of x_start under the model; for every other t it is the
        KL between the posterior given x_start and the model's step
        distribution.

        :return: dict with keys "output", "pred_xstart", "out_mean" and
                 "true_mean".
        """
        posterior = self.gaussian_q_posterior_mean_variance(
            x_start=x_start, x_t=x_t, t=t
        )
        true_mean, true_log_variance_clipped = posterior[0], posterior[2]

        out = self.gaussian_p_mean_variance(
            model_output, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
        )
        model_mean, model_log_variance = out["mean"], out["log_variance"]

        kl = mean_flat(
            normal_kl(true_mean, true_log_variance_clipped, model_mean, model_log_variance)
        ) / np.log(2.0)

        decoder_nll = -discretized_gaussian_log_likelihood(
            x_start, means=model_mean, log_scales=0.5 * model_log_variance
        )
        assert decoder_nll.shape == x_start.shape
        decoder_nll = mean_flat(decoder_nll) / np.log(2.0)

        # At the first timestep return the decoder NLL,
        # otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
        output = torch.where((t == 0), decoder_nll, kl)
        return {
            "output": output,
            "pred_xstart": out["pred_xstart"],
            "out_mean": model_mean,
            "true_mean": true_mean,
        }
    
    def _prior_gaussian(self, x_start):
        """
        Get the prior KL term for the variational lower-bound, divided by
        log(2).

        The noised distribution at the last timestep is compared against a
        Gaussian with mean 0 and log-variance 0.  This term can't be
        optimized, as it only depends on the encoder.

        :param x_start: the batch of inputs; the first dimension is the batch.
        :return: one KL value per batch element.
        """
        n = x_start.shape[0]
        last_step = self.num_timesteps - 1
        t = torch.tensor([last_step] * n, device=x_start.device)

        qt_mean, _, qt_log_variance = self.gaussian_q_mean_variance(x_start, t)
        kl_prior = normal_kl(
            mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
        )
        return mean_flat(kl_prior) / np.log(2.0)
    
    def _gaussian_loss(self, model_out, x_start, x_t, t, noise, model_kwargs=None):
        """
        Compute the loss of the Gaussian (numerical) part.

        For 'mse' the regression target follows gaussian_parametrization so
        that it agrees with how gaussian_p_mean_variance reads the network
        output: x_start for 'x0', the injected noise otherwise.  For 'kl' the
        variational-bound term from _vb_terms_bpd is returned.

        :param model_out: network output for the numerical features.
        :param x_start: clean inputs.
        :param x_t: noised inputs at step t.
        :param t: timestep indices.
        :param noise: the noise that produced x_t from x_start.
        :param model_kwargs: forwarded to _vb_terms_bpd.
        :return: the loss produced by the selected loss type.
        :raises NotImplementedError: if gaussian_loss_type is neither 'mse'
            nor 'kl'.
        """
        if model_kwargs is None:
            model_kwargs = {}

        if self.gaussian_loss_type == 'mse':
            target = x_start if self.gaussian_parametrization == 'x0' else noise
            return mean_flat((target - model_out) ** 2)

        if self.gaussian_loss_type == 'kl':
            return self._vb_terms_bpd(
                model_output=model_out,
                x_start=x_start,
                x_t=x_t,
                t=t,
                clip_denoised=False,
                model_kwargs=model_kwargs,
            )["output"]

        # Fail with a clear message rather than a KeyError on a missing
        # dictionary entry.
        raise NotImplementedError(
            f"unknown gaussian loss type: {self.gaussian_loss_type}"
        )
    
    def _predict_xstart_from_eps(self, x_t, t, eps):
        """Recover x_0 from x_t and a noise estimate: sqrt_recip_alphas_cumprod[t] * x_t - sqrt_recipm1_alphas_cumprod[t] * eps."""
        assert x_t.shape == eps.shape
        scaled_input = extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
        scaled_noise = extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
        return scaled_input - scaled_noise
    
    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        """Inverse of _predict_xstart_from_eps: recover the noise estimate from x_t and a predicted x_0."""
        scaled_input = extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
        residual = scaled_input - pred_xstart
        return residual / extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

    def gaussian_p_sample(
        self,
        model_out,
        x,
        t,
        clip_denoised=False,
        denoised_fn=None,
        model_kwargs=None,
    ):
        """
        Draw one reverse-step sample for the Gaussian part.

        The sample is the model mean plus exp(0.5 * log_variance) times
        standard normal noise; the noise term is switched off for batch
        elements with t == 0.

        :return: dict with the float32 "sample" and the model's "pred_xstart".
        """
        out = self.gaussian_p_mean_variance(
            model_out,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = torch.randn_like(x)

        # 1.0 where t != 0 and 0.0 where t == 0, shaped to broadcast over x.
        trailing_ones = [1] * (len(x.shape) - 1)
        nonzero_mask = (t != 0).float().view(-1, *trailing_ones)

        std = torch.exp(0.5 * out["log_variance"])
        sample = (out["mean"] + nonzero_mask * std * noise).float()
        return {"sample": sample, "pred_xstart": out["pred_xstart"]}

    # Multinomial part

    def multinomial_kl(self, log_prob1, log_prob2):
        """Return sum over dim 1 of exp(log_prob1) * (log_prob1 - log_prob2)."""
        log_ratio = log_prob1 - log_prob2
        weighted = log_prob1.exp() * log_ratio
        return torch.sum(weighted, dim=1)

    def q_pred_one_timestep(self, log_x_t, t):
        """
        Apply one forward categorical step in log space.

        Combines, via log_add_exp, the current log-probabilities weighted by
        alpha_t with (1 - alpha_t) / K, where K is read per column from
        num_classes_expanded.
        """
        shape = log_x_t.shape
        # alpha_t * E[xt]
        keep_term = log_x_t + extract(self.log_alpha, t, shape)
        # (1 - alpha_t) * 1 / K
        uniform_term = (
            extract(self.log_1_min_alpha, t, shape)
            - torch.log(self.num_classes_expanded)
        )
        return log_add_exp(keep_term, uniform_term)

    def q_pred(self, log_x_start, t):
        """
        Apply the cumulative forward categorical transition in log space.

        Same form as q_pred_one_timestep, but using the cumulative tables
        log_cumprod_alpha and log_1_min_cumprod_alpha.
        """
        shape = log_x_start.shape
        keep_term = log_x_start + extract(self.log_cumprod_alpha, t, shape)
        uniform_term = (
            extract(self.log_1_min_cumprod_alpha, t, shape)
            - torch.log(self.num_classes_expanded)
        )
        return log_add_exp(keep_term, uniform_term)

    def predict_start(self, model_out, log_x_t, t):
        """
        Normalize the network logits per categorical feature.

        A log-softmax is applied separately to each column group listed in
        slices_for_classes.  log_x_t is only used for a batch-size check and t
        is not used.

        :return: tensor shaped like model_out holding per-feature
                 log-probabilities.
        """
        assert model_out.size(0) == log_x_t.size(0)
        assert model_out.size(1) == self.num_classes.sum(), f'{model_out.size()}'

        log_pred = torch.empty_like(model_out)
        for columns in self.slices_for_classes:
            feature_logits = model_out[:, columns]
            log_pred[:, columns] = feature_logits.log_softmax(dim=1)
        return log_pred

    def q_posterior(self, log_x_start, log_x_t, t):
        """
        Return the log-probabilities of q(x_{t-1} | x_t, x_0).

        Uses q(xt-1 | xt, x0) = q(xt | xt-1, x0) * q(xt-1 | x0) / q(xt | x0)
        with q(xt | xt-1, x0) = q(xt | xt-1).  The normalization is done per
        categorical feature through sliced_logsumexp over self.offsets.
        """
        # q(x_{t-1} | x_0) at step t - 1.  Negative steps are mapped to 0;
        # those entries are overwritten below anyway.
        prev_t = t - 1
        prev_t = torch.where(prev_t < 0, torch.zeros_like(prev_t), prev_t)
        log_EV_qxtmin_x0 = self.q_pred(log_x_start, prev_t)

        # For batch elements at t == 0 use log_x_start itself.
        trailing_ones = (1,) * (len(log_x_start.size()) - 1)
        is_first_step = (t.to(log_x_start.device) == 0).view(-1, *trailing_ones)
        log_EV_qxtmin_x0 = torch.where(
            is_first_step, log_x_start, log_EV_qxtmin_x0.to(torch.float32)
        )

        # Note: the one-step term is evaluated at x_t, _NOT_ x_{t-1}, which is
        # how the formula is typically used.
        unnormed_logprobs = log_EV_qxtmin_x0 + self.q_pred_one_timestep(log_x_t, t)

        log_normalizer = sliced_logsumexp(unnormed_logprobs, self.offsets)
        return unnormed_logprobs - log_normalizer

    def p_pred(self, model_out, log_x, t):
        """Return the model's log-distribution for the reverse step at ``t``.

        With ``parametrization == 'x0'`` the network output is read as a
        prediction of x_0 and pushed through ``q_posterior``; with ``'direct'``
        the normalised network output is returned as is.

        Raises:
            ValueError: for any other parametrization.
        """
        mode = self.parametrization
        if mode not in ('x0', 'direct'):
            raise ValueError

        log_x0_hat = self.predict_start(model_out, log_x, t=t)
        if mode == 'direct':
            return log_x0_hat

        return self.q_posterior(log_x_start=log_x0_hat, log_x_t=log_x, t=t)

    @torch.no_grad()
    def p_sample(self, model_out, log_x, t):
        """Draw a categorical sample from the model's reverse distribution at ``t``.

        The log-distribution comes from ``p_pred`` and the draw is delegated to
        ``log_sample_categorical``.
        """
        reverse_log_probs = self.p_pred(model_out, log_x=log_x, t=t)
        return self.log_sample_categorical(reverse_log_probs)

    @torch.no_grad()
    def p_sample_loop(self, shape):
        """Iterate ``p_sample`` from Gaussian noise down to timestep 1.

        NOTE(review): ``self.p_sample`` is called here with two positional
        arguments (state, timestep), while the ``p_sample`` defined in this
        class takes ``(model_out, log_x, t)``. This looks like leftover
        image-diffusion code; confirm whether it is still reachable.

        Args:
            shape: shape of the tensor to sample; ``shape[0]`` is the batch size.
        """
        device = self.log_alpha.device
        batch = shape[0]

        # Initial state: standard normal noise of the requested shape.
        state = torch.randn(shape, device=device)

        # Visits num_timesteps - 1, ..., 1; timestep 0 is not visited.
        for step in range(self.num_timesteps - 1, 0, -1):
            step_t = torch.full((batch,), step, device=device, dtype=torch.long)
            state = self.p_sample(state, step_t)
        return state

    @torch.no_grad()
    def _sample(self, image_size, batch_size = 16):
        return self.p_sample_loop((batch_size, 3, image_size, image_size))

    @torch.no_grad()
    def interpolate(self, x1, x2, t = None, lam = 0.5):
        """Noise two inputs to step ``t``, blend them, then denoise the blend.

        NOTE(review): ``self.p_sample`` is called with two positional arguments
        here, while the ``p_sample`` defined in this class takes
        ``(model_out, log_x, t)``. Confirm whether this path is still used.

        Args:
            x1, x2: inputs of identical shape.
            t: timestep to noise to; falls back to ``num_timesteps - 1`` via
                ``default`` when not given.
            lam: blend weight applied to the noised ``x2`` (``1 - lam`` to ``x1``).
        """
        batch, device = x1.shape[0], x1.device
        t = default(t, self.num_timesteps - 1)

        assert x1.shape == x2.shape

        # One copy of the scalar timestep per batch row.
        t_batched = torch.stack([torch.tensor(t, device=device)] * batch)
        noised_1 = self.q_sample(x1, t=t_batched)
        noised_2 = self.q_sample(x2, t=t_batched)

        blended = (1 - lam) * noised_1 + lam * noised_2
        # Walk back through t - 1, ..., 0.
        for step in range(t - 1, -1, -1):
            step_t = torch.full((batch,), step, device=device, dtype=torch.long)
            blended = self.p_sample(blended, step_t)

        return blended

    def log_sample_categorical(self, logits):
        """Sample one class per categorical feature and return it as log one-hot.

        For each feature, Gumbel noise is added to that feature's slice of
        ``logits`` and the argmax is taken (the Gumbel-max trick). The class
        indices are then converted with ``index_to_log_onehot``.
        """
        picks = []
        for feature_idx, _ in enumerate(self.num_classes):
            feature_logits = logits[:, self.slices_for_classes[feature_idx]]
            # -log(-log(u)) with u ~ U(0, 1); the 1e-30 offsets keep both logs finite.
            u = torch.rand_like(feature_logits)
            gumbel = -torch.log(-torch.log(u + 1e-30) + 1e-30)
            chosen = (gumbel + feature_logits).argmax(dim=1)
            picks.append(chosen.unsqueeze(1))

        class_indices = torch.cat(picks, dim=1)
        return index_to_log_onehot(class_indices, self.num_classes)

    def q_sample(self, log_x_start, t):
        """Sample a noised categorical state at step ``t`` from the clean data.

        Computes the forward log-distribution with ``q_pred`` and samples from
        it with ``log_sample_categorical``.
        """
        return self.log_sample_categorical(self.q_pred(log_x_start, t))

    def nll(self, log_x_start):
        """Sum the per-timestep loss terms over every timestep, plus the prior KL.

        NOTE(review): ``compute_Lt`` is called without ``model_out`` here, but
        the ``compute_Lt`` defined in this class requires it as its first
        positional argument. The only visible caller marks this path as
        deprecated; confirm before relying on it.
        """
        batch = log_x_start.size(0)
        device = log_x_start.device

        total = 0
        for step in range(self.num_timesteps):
            step_t = torch.full((batch,), step, device=device, dtype=torch.long)
            log_x_t = self.q_sample(log_x_start=log_x_start, t=step_t)
            total += self.compute_Lt(
                log_x_start=log_x_start,
                log_x_t=log_x_t,
                t=step_t)

        total += self.kl_prior(log_x_start)
        return total

    def kl_prior(self, log_x_start):
        """KL between the forward distribution at the last step and a uniform prior.

        The forward distribution is ``q_pred`` evaluated at
        ``num_timesteps - 1``; the prior assigns ``1 / num_classes_expanded`` to
        every column. The KL is reduced with ``sum_except_batch``.
        """
        batch = log_x_start.size(0)
        ones = torch.ones(batch, device=log_x_start.device).long()
        last_t = (self.num_timesteps - 1) * ones

        log_q_final = self.q_pred(log_x_start, t=last_t)
        # log(1 / K), broadcast to the full shape of log_q_final.
        log_uniform = -torch.log(self.num_classes_expanded * torch.ones_like(log_q_final))

        return sum_except_batch(self.multinomial_kl(log_q_final, log_uniform))

    def compute_Lt(self, model_out, log_x_start, log_x_t, t, detach_mean=False):
        """Per-sample categorical loss term at timestep ``t``.

        Rows with ``t == 0`` get the decoder negative log-likelihood of
        ``log_x_start`` under the model distribution; every other row gets the
        KL between the true posterior and the model distribution.

        Args:
            model_out: raw network output for the categorical columns.
            log_x_start: log one-hot of the clean categorical data.
            log_x_t: log one-hot of the noised categorical data.
            t: per-row timestep indices.
            detach_mean: if True, the model distribution is detached from the
                autograd graph before the loss is computed.
        """
        log_q_posterior = self.q_posterior(
            log_x_start=log_x_start, log_x_t=log_x_t, t=t)
        log_p_model = self.p_pred(model_out, log_x=log_x_t, t=t)
        if detach_mean:
            log_p_model = log_p_model.detach()

        kl_term = sum_except_batch(self.multinomial_kl(log_q_posterior, log_p_model))
        nll_term = sum_except_batch(-log_categorical(log_x_start, log_p_model))

        # 1.0 where t == 0, else 0.0: selects the decoder term for the first step.
        is_first_step = (t == torch.zeros_like(t)).float()
        return is_first_step * nll_term + (1. - is_first_step) * kl_term

    def sample_time(self, b, device, method='uniform'):
        """Draw ``b`` timesteps and the probability with which each was drawn.

        Args:
            b: number of timesteps to draw.
            device: device for the returned tensors.
            method: ``'uniform'`` draws uniformly from ``[0, num_timesteps)``;
                ``'importance'`` draws proportionally to the square root of
                ``Lt_history`` and falls back to uniform until every entry of
                ``Lt_count`` exceeds 10.

        Returns:
            ``(t, pt)``: the sampled timesteps and their sampling probabilities.

        Raises:
            ValueError: for an unknown ``method``.
        """
        if method == 'uniform':
            t = torch.randint(0, self.num_timesteps, (b,), device=device).long()
            pt = torch.ones_like(t).float() / self.num_timesteps
            return t, pt

        if method != 'importance':
            raise ValueError

        # Not enough loss history yet for every timestep: sample uniformly.
        if not (self.Lt_count > 10).all():
            return self.sample_time(b, device, method='uniform')

        weights = torch.sqrt(self.Lt_history + 1e-10) + 0.0001
        weights[0] = weights[1]  # Overwrite decoder term with L1.
        probs = (weights / weights.sum()).to(device)

        t = torch.multinomial(probs, num_samples=b, replacement=True).to(device)
        return t, probs.gather(dim=0, index=t)

    def _multinomial_loss(self, model_out, log_x_start, log_x_t, t, pt):

        if self.multinomial_loss_type == 'vb_stochastic':

            kl = self.compute_Lt(
                model_out, log_x_start, log_x_t, t
            )
            kl_prior = self.kl_prior(log_x_start)
            # Upweigh loss term of the kl

            vb_loss = kl / pt + kl_prior


            return vb_loss

        elif self.multinomial_loss_type == 'vb_all':
            # Expensive, dont do it ;).
            # DEPRECATED
            return -self.nll(log_x_start)
        else:
            raise ValueError()

    def log_prob(self, x):
        """Return a (negated-loss) log-probability estimate for ``x``.

        In training mode the call is forwarded to ``_multinomial_loss``. In eval
        mode a timestep is drawn with importance sampling and the negated sum of
        the re-weighted step term and the prior KL is returned.

        NOTE(review): both branches look out of date with the rest of the class.
        ``_multinomial_loss`` is called with one argument but is defined with
        five, and ``compute_Lt`` is called with three positional arguments
        although its first parameter is ``model_out``. Confirm whether this
        method is still used.
        """
        b, device = x.size(0), x.device
        if self.training:
            return self._multinomial_loss(x)

        log_x_start = index_to_log_onehot(x, self.num_classes)
        t, pt = self.sample_time(b, device, 'importance')

        log_x_t = self.q_sample(log_x_start=log_x_start, t=t)
        step_term = self.compute_Lt(log_x_start, log_x_t, t)
        prior_term = self.kl_prior(log_x_start)

        # Dividing by (pt + 1e-6) up-weights the sampled timestep's term.
        return -(step_term / (pt + 1e-6) + prior_term)
    
    @torch.no_grad()
    def loss_at_step_t(self, x, step):
        """Evaluate the losses for a whole batch at one fixed timestep.

        The first ``num_numerical_features`` columns of ``x`` are treated as
        numerical and the rest as categorical class indices. Each part is noised
        to ``step``, the denoiser is run once, and three scalar means are
        returned.

        Returns:
            ``(loss_multi, loss_gauss, recon_loss)``: the multinomial loss
            divided by the number of categorical features, the Gaussian loss,
            and ``_gaussian_loss`` evaluated on the ``recon_x0`` reconstruction
            with ``x_num`` passed in the noise slot.
        """
        batch = x.shape[0]
        device = x.device
        n_num = self.num_numerical_features

        t = (torch.ones((batch,)) * step).long().to(device)
        pt = torch.ones_like(t).float() / self.num_timesteps

        x_num, x_cat = x[:, :n_num], x[:, n_num:]
        has_num = x_num.shape[1] > 0
        has_cat = x_cat.shape[1] > 0

        # Forward-noise each part that is present; empty parts pass through.
        x_num_t = x_num
        log_x_cat_t = x_cat
        if has_num:
            noise = torch.randn_like(x_num)
            x_num_t = self.gaussian_q_sample(x_num, t, noise=noise)
        if has_cat:
            log_x_cat = index_to_log_onehot(x_cat.long(), self.num_classes)
            log_x_cat_t = self.q_sample(log_x_start=log_x_cat, t=t)

        x_in = torch.cat([x_num_t, log_x_cat_t], dim=1)
        model_out = self._denoise_fn(x_in, t)

        loss_multi = torch.zeros((1,)).float()
        loss_gauss = torch.zeros((1,)).float()
        if has_cat:
            loss_multi = self._multinomial_loss(
                model_out[:, n_num:], log_x_cat, log_x_cat_t, t, pt
            ) / len(self.num_classes)
        if has_num:
            loss_gauss = self._gaussian_loss(model_out[:, :n_num], x_num, x_num_t, t, noise)

        recon_x0_num = self.recon_x0(x_in, model_out, t)[:, :n_num]
        recon_loss = self._gaussian_loss(recon_x0_num, x_num, x_num_t, t, x_num)

        return loss_multi.mean(), loss_gauss.mean(), recon_loss.mean()
    
    @torch.no_grad()
    def recon_x0(self, x, model_out, t):
        """Reconstruct x_0 from a noised input and the model output.

        Computes ``sqrt_recip_alphas_cumprod[t] * (x - model_out *
        sqrt_one_minus_alphas_cumprod[t])``, i.e. ``model_out`` is used in the
        role of the predicted noise.
        """
        shape = x.shape
        recip_scale = extract(self.sqrt_recip_alphas_cumprod, t, shape)
        noise_scale = extract(self.sqrt_one_minus_alphas_cumprod, t, shape)

        return recip_scale * (x - model_out * noise_scale)

    def mixed_loss(self, x):
        """Training loss for one batch with every feature diffused as Gaussian.

        The first ``num_numerical_features`` columns of ``x`` are numerical; the
        remaining categorical columns are transformed with ``cat2psk`` and
        concatenated to the numerical ones. The combined tensor is noised at a
        uniformly sampled timestep, denoised by ``_denoise_fn`` and scored with
        ``_gaussian_loss``.

        Side effect: on the first call the second value returned by ``cat2psk``
        is stored in ``self.precomputed_refs`` and passed back to ``cat2psk`` on
        later calls.

        Returns:
            ``(loss_multi, loss_gauss)``: ``loss_multi`` is always a zero scalar
            (the multinomial branch is disabled here); ``loss_gauss`` is the
            mean Gaussian loss.

        Raises:
            ValueError: if the combined feature tensor has no columns.
        """
        b = x.shape[0]
        device = x.device
        t, _ = self.sample_time(b, device, 'uniform')

        x_num = x[:, :self.num_numerical_features]
        x_cat = x[:, self.num_numerical_features:]

        # Identity check, not `== None`: equality on an array-like reference
        # object would be evaluated element-wise instead of yielding a bool.
        if self.precomputed_refs is None:
            x_cat, self.precomputed_refs = cat2psk(x_cat, self.num_classes)
        else:
            x_cat, _ = cat2psk(
                x_cat, self.num_classes, precomputed_refs=self.precomputed_refs
            )

        x_bits = torch.cat([x_num, x_cat], dim=1)
        if x_bits.shape[1] == 0:
            # Previously this path failed later with an unbound-variable error.
            raise ValueError('mixed_loss requires at least one feature column')

        noise = torch.randn_like(x_bits)
        x_bits_t = self.gaussian_q_sample(x_bits, t, noise=noise)

        model_out = self._denoise_fn(x_bits_t, t)

        # Kept as a zero placeholder so callers still receive a 2-tuple.
        loss_multi = torch.zeros((1,)).float()
        loss_gauss = self._gaussian_loss(model_out, x_bits, x_bits_t, t, noise)

        return loss_multi.mean(), loss_gauss.mean()
    
    @torch.no_grad()
    def mixed_elbo(self, x0):
        """Evaluate the variational-bound terms at every timestep for a batch.

        For each timestep the numerical columns are noised with
        ``gaussian_q_sample`` and the categorical columns (if any) with
        ``q_sample``. The denoiser is run once per timestep, and the Gaussian
        term (``_vb_terms_bpd``) and the categorical term (``compute_Lt``) are
        recorded together with a few diagnostics.

        Returns:
            dict with keys ``total_gaussian``, ``total_multinomial``,
            ``losses_gaussian``, ``losses_multinimial``, ``xstart_mse``,
            ``mse``, ``out_mean`` and ``true_mean``. Per-timestep entries are
            stacked along dim 1.
        """
        b = x0.size(0)
        device = x0.device
        n_num = self.num_numerical_features

        x_num, x_cat = x0[:, :n_num], x0[:, n_num:]
        has_cat = x_cat.shape[1] > 0
        if has_cat:
            log_x_cat = index_to_log_onehot(x_cat.long(), self.num_classes).to(device)

        gauss_terms = []
        multin_terms = []
        xstart_errs = []
        eps_errs = []
        pred_means = []
        target_means = []
        for step in range(self.num_timesteps):
            t_array = (torch.ones(b, device=device) * step).long()
            noise = torch.randn_like(x_num)

            x_num_t = self.gaussian_q_sample(x_start=x_num, t=t_array, noise=noise)
            # Without categorical columns the empty slice is passed through.
            log_x_cat_t = (
                self.q_sample(log_x_start=log_x_cat, t=t_array) if has_cat else x_cat
            )

            model_out = self._denoise_fn(
                torch.cat([x_num_t, log_x_cat_t], dim=1),
                t_array
            )

            if has_cat:
                kl = self.compute_Lt(
                    model_out=model_out[:, n_num:],
                    log_x_start=log_x_cat,
                    log_x_t=log_x_cat_t,
                    t=t_array
                )
            else:
                kl = torch.tensor([0.0])

            out = self._vb_terms_bpd(
                model_out[:, :n_num],
                x_start=x_num,
                x_t=x_num_t,
                t=t_array,
                clip_denoised=False
            )

            multin_terms.append(kl)
            gauss_terms.append(out["output"])
            xstart_errs.append(mean_flat((out["pred_xstart"] - x_num) ** 2))
            pred_means.append(mean_flat(out["out_mean"]))
            target_means.append(mean_flat(out["true_mean"]))

            # Squared error between the noise implied by pred_xstart and the true noise.
            eps = self._predict_eps_from_xstart(x_num_t, t_array, out["pred_xstart"])
            eps_errs.append(mean_flat((eps - noise) ** 2))

        gaussian_loss = torch.stack(gauss_terms, dim=1)
        multinomial_loss = torch.stack(multin_terms, dim=1)
        xstart_mse = torch.stack(xstart_errs, dim=1)
        mse = torch.stack(eps_errs, dim=1)
        out_mean = torch.stack(pred_means, dim=1)
        true_mean = torch.stack(target_means, dim=1)

        prior_gauss = self._prior_gaussian(x_num)
        prior_multin = self.kl_prior(log_x_cat) if has_cat else torch.tensor([0.0])

        return {
            "total_gaussian": gaussian_loss.sum(dim=1) + prior_gauss,
            "total_multinomial": multinomial_loss.sum(dim=1) + prior_multin,
            "losses_gaussian": gaussian_loss,
            "losses_multinimial": multinomial_loss,
            "xstart_mse": xstart_mse,
            "mse": mse,
            "out_mean": out_mean,
            "true_mean": true_mean
        }

    @torch.no_grad()
    def gaussian_ddim_step(
        self,
        model_out_num,
        x,
        t,
        t_prev,
        clip_denoised=False,
        denoised_fn=None,
        eta=1.0
    ):
        """Take one DDIM-style step from timestep ``t`` to ``t_prev``.

        Args:
            model_out_num: denoiser output for the columns in ``x``.
            x: current noisy state.
            t: current timestep indices, one per batch row.
            t_prev: timestep indices to move to.
            clip_denoised, denoised_fn: forwarded to ``gaussian_p_mean_variance``.
            eta: scale of the injected noise; 0 makes the step deterministic.

        Returns:
            The state at ``t_prev``. No noise is added for rows with ``t == 0``.

        NOTE(review): the schedule-table choice below looks only at ``t[0]``,
        so it assumes every row of the batch shares the same timestep.
        """
        out = self.gaussian_p_mean_variance(
            model_out_num,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=None,
        )
        pred_xstart = out["pred_xstart"]

        eps = self._predict_eps_from_xstart(x, t, pred_xstart)

        alpha_bar = extract(self.alphas_cumprod, t, x.shape)

        # On the final step (t == 0) the previous value is read from
        # alphas_cumprod_prev instead of alphas_cumprod.
        prev_table = self.alphas_cumprod if t[0] != 0 else self.alphas_cumprod_prev
        alpha_bar_prev = extract(prev_table, t_prev, x.shape)

        sigma = (
            eta
            * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
            * torch.sqrt(1 - alpha_bar / alpha_bar_prev)
        )

        noise = torch.randn_like(x)
        mean_pred = (
            pred_xstart * torch.sqrt(alpha_bar_prev)
            + torch.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
        )

        # 1.0 for rows with t != 0, broadcastable against x; no noise when t == 0.
        add_noise = (t != 0).float().view(-1, *([1] * (x.dim() - 1)))
        return mean_pred + add_noise * sigma * noise

    
    @torch.no_grad()
    def gaussian_ddim_sample(
        self,
        noise,
        T,
        eta=0.0
    ):
        """Run ``gaussian_ddim_step`` over timesteps ``T - 1, ..., 0``.

        Args:
            noise: initial state; its first dimension is the batch size.
            T: number of consecutive timesteps to walk back through.
            eta: noise scale forwarded to every ``gaussian_ddim_step`` call.

        Returns:
            The state after the step at timestep 0.
        """
        x = noise
        b = x.shape[0]
        device = x.device
        for step in reversed(range(T)):
            print(f'Sample timestep {step:4d}', end='\r')
            t_array = torch.full((b,), step, device=device, dtype=torch.long)
            # gaussian_ddim_step requires the timestep being moved to. With a
            # stride of one that is step - 1, held at 0 for the final step.
            t_prev = torch.full((b,), max(step - 1, 0), device=device, dtype=torch.long)
            out_num = self._denoise_fn(x, t_array)
            x = self.gaussian_ddim_step(
                out_num,
                x,
                t_array,
                t_prev,
                eta=eta
            )
        print()
        return x
    





    def bit_to_log_onehot(self, z_cat_bits, num_classes, num_bits_per_cat_feature):
        """
        Convert bit representation to log one-hot representation.

        For each feature its bit columns are used directly as logits. With more
        bits than classes the extra bits are dropped; with fewer bits than
        classes the bits are tiled cyclically until there is one logit per
        class. The logits are then normalised with a log-softmax.

        Args:
            z_cat_bits: Tensor of shape (batch_size, total_bits) containing bit representations
            num_classes: List of number of classes for each categorical feature
            num_bits_per_cat_feature: List of number of bits for each categorical feature

        Returns:
            log_onehot: Tensor of shape (batch_size, total_categories) with
            per-feature log-probabilities. With no features an empty
            (batch_size, 0) tensor is returned.
        """
        log_probs = []
        bit_idx = 0

        for n_classes, n_bits in zip(num_classes, num_bits_per_cat_feature):
            # Extract bits for this categorical feature
            feature_bits = z_cat_bits[:, bit_idx:bit_idx + n_bits]

            if n_bits >= n_classes:
                # Enough bits: keep the first n_classes of them (all, if equal).
                logits = feature_bits[:, :n_classes]
            else:
                # Too few bits: tile them and cut to exactly n_classes columns.
                repeats = int(-(-n_classes // n_bits))  # ceil(n_classes / n_bits)
                logits = feature_bits.repeat(1, repeats)[:, :n_classes]

            log_probs.append(F.log_softmax(logits, dim=1))
            bit_idx += n_bits

        if not log_probs:
            # torch.cat rejects an empty list; return an empty block instead.
            return z_cat_bits.new_zeros((z_cat_bits.shape[0], 0))
        return torch.cat(log_probs, dim=1)


    def log_onehot_to_bit(self, log_onehot, num_classes, num_bits_per_cat_feature):
        """
        Convert log one-hot representation back to bit representation.

        For each feature the log-probabilities are exponentiated, resized to
        ``n_bits`` columns (cut when there are more classes than bits, tiled
        cyclically when there are fewer), clamped to [1e-7, 1 - 1e-7] and mapped
        through ``torch.logit``.

        Args:
            log_onehot: Tensor of shape (batch_size, total_categories) in log one-hot format
            num_classes: List of number of classes for each categorical feature
            num_bits_per_cat_feature: List of number of bits for each categorical feature

        Returns:
            z_cat_bits: Tensor of shape (batch_size, total_bits) containing bit
            representations. With no features an empty (batch_size, 0) tensor is
            returned.
        """
        bit_representations = []
        cat_idx = 0

        for n_classes, n_bits in zip(num_classes, num_bits_per_cat_feature):
            # Probabilities for this categorical feature
            probs = torch.exp(log_onehot[:, cat_idx:cat_idx + n_classes])

            if n_bits <= n_classes:
                # Keep the first n_bits class probabilities (all, if equal).
                resized = probs[:, :n_bits]
            else:
                # Too few classes: tile them and cut to exactly n_bits columns.
                repeats = int(-(-n_bits // n_classes))  # ceil(n_bits / n_classes)
                resized = probs.repeat(1, repeats)[:, :n_bits]

            # The clamp keeps logit finite for probabilities of exactly 0 or 1.
            bit_representations.append(torch.logit(resized.clamp(1e-7, 1-1e-7)))
            cat_idx += n_classes

        if not bit_representations:
            # torch.cat rejects an empty list; return an empty block instead.
            return log_onehot.new_zeros((log_onehot.shape[0], 0))
        return torch.cat(bit_representations, dim=1)



    # Alternative simpler versions if the above is too complex:

    def simple_bit_to_log_onehot(self, z_cat_bits, num_classes, num_bits_per_cat_feature):
        """
        Simplified version: treat bits as direct logits for categorical distribution.

        Each feature's bits are zero-padded (fewer bits than classes) or
        truncated (otherwise) to ``n_classes`` logits and normalised with a
        log-softmax.

        Args:
            z_cat_bits: Tensor of shape (batch_size, total_bits).
            num_classes: number of classes for each categorical feature.
            num_bits_per_cat_feature: number of bits for each categorical feature.

        Returns:
            Tensor of shape (batch_size, total_categories); an empty
            (batch_size, 0) tensor when there are no features.
        """
        batch_size = z_cat_bits.shape[0]
        log_probs = []
        bit_idx = 0

        for n_classes, n_bits in zip(num_classes, num_bits_per_cat_feature):
            feature_bits = z_cat_bits[:, bit_idx:bit_idx + n_bits]

            if n_bits < n_classes:
                # Pad with zeros; new_zeros keeps the input's dtype and device.
                padding = feature_bits.new_zeros((batch_size, n_classes - n_bits))
                logits = torch.cat([feature_bits, padding], dim=1)
            else:
                # Truncate
                logits = feature_bits[:, :n_classes]

            log_probs.append(F.log_softmax(logits, dim=1))
            bit_idx += n_bits

        if not log_probs:
            # torch.cat rejects an empty list; return an empty block instead.
            return z_cat_bits.new_zeros((batch_size, 0))
        return torch.cat(log_probs, dim=1)


    def simple_log_onehot_to_bit(self, log_onehot, num_classes, num_bits_per_cat_feature):
        """
        Simplified version: convert log probabilities back to logits (bits).

        The log-probabilities themselves are used as the logits; the softmax
        normaliser is not recovered, so this is an approximation. Each feature
        is zero-padded (fewer classes than bits) or truncated (otherwise) to
        ``n_bits`` columns.

        Args:
            log_onehot: Tensor of shape (batch_size, total_categories).
            num_classes: number of classes for each categorical feature.
            num_bits_per_cat_feature: number of bits for each categorical feature.

        Returns:
            Tensor of shape (batch_size, total_bits); an empty (batch_size, 0)
            tensor when there are no features.
        """
        bit_representations = []
        cat_idx = 0

        for n_classes, n_bits in zip(num_classes, num_bits_per_cat_feature):
            logits = log_onehot[:, cat_idx:cat_idx + n_classes]

            if n_classes < n_bits:
                # Pad with zeros; new_zeros keeps the input's dtype and device.
                padding = logits.new_zeros((logits.shape[0], n_bits - n_classes))
                bits = torch.cat([logits, padding], dim=1)
            else:
                # Truncate
                bits = logits[:, :n_bits]

            bit_representations.append(bits)
            cat_idx += n_classes

        if not bit_representations:
            # torch.cat rejects an empty list; return an empty block instead.
            return log_onehot.new_zeros((log_onehot.shape[0], 0))
        return torch.cat(bit_representations, dim=1)


    @torch.no_grad()
    def gaussian_ddim_reverse_step(
        self,
        model_out_num,
        x,
        t,
        clip_denoised=False,
        eta=0.0
    ):
        """Take one deterministic step forward in time (towards more noise).

        The noise implied by the predicted x_0 is recovered from ``x`` and then
        recombined with that prediction using ``alphas_cumprod_next[t]``.

        Args:
            model_out_num: denoiser output for ``x``.
            x: current state.
            t: current timestep indices.
            clip_denoised: forwarded to ``gaussian_p_mean_variance``.
            eta: must be 0.0; any other value fails the assertion.
        """
        assert eta == 0.0, "Eta must be zero."
        out = self.gaussian_p_mean_variance(
            model_out_num,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=None,
            model_kwargs=None,
        )
        pred_xstart = out["pred_xstart"]

        recip = extract(self.sqrt_recip_alphas_cumprod, t, x.shape)
        recipm1 = extract(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
        eps = (recip * x - pred_xstart) / recipm1

        alpha_bar_next = extract(self.alphas_cumprod_next, t, x.shape)

        return (
            pred_xstart * torch.sqrt(alpha_bar_next)
            + torch.sqrt(1 - alpha_bar_next) * eps
        )

    @torch.no_grad()
    def gaussian_ddim_reverse_sample(
        self,
        x,
        T
    ):
        """Apply ``gaussian_ddim_reverse_step`` for timesteps ``0, ..., T - 1``.

        The denoiser is evaluated on the current state before every step.
        Progress is printed on a single, carriage-return-updated line.
        """
        batch, device = x.shape[0], x.device
        for step in range(T):
            print(f'Reverse timestep {step:4d}', end='\r')
            t_array = torch.full((batch,), step, device=device, dtype=torch.long)
            x = self.gaussian_ddim_reverse_step(
                self._denoise_fn(x, t_array),
                x,
                t_array,
                eta=0.0
            )
        print()

        return x


    @torch.no_grad()
    def multinomial_ddim_step(
        self,
        model_out_cat,
        log_x_t,
        t,
        t_prev,
        eta=1.0
    ):
        """Sample the categorical state at ``t_prev`` from a three-way mixture.

        The mixture combines the current state ``x_t``, the predicted ``x_0``
        and the uniform distribution, weighted by ``sigma``,
        ``alpha_bar_prev - sigma * alpha_bar`` and the remaining mass. As the
        original author notes, this is essentially not a DDIM step.

        NOTE(review): the schedule-table choice below looks only at ``t[0]``,
        so it assumes every row of the batch shares the same timestep.
        """
        log_x0 = self.predict_start(model_out_cat, log_x_t=log_x_t, t=t)

        alpha_bar = extract(self.alphas_cumprod, t, log_x_t.shape)

        # On the final step (t == 0) the previous value is read from
        # alphas_cumprod_prev instead of alphas_cumprod.
        prev_table = self.alphas_cumprod if t[0] != 0 else self.alphas_cumprod_prev
        alpha_bar_prev = extract(prev_table, t_prev, log_x_t.shape)

        sigma = (
            eta
            * torch.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
            * torch.sqrt(1 - alpha_bar / alpha_bar_prev)
        )

        w_current = sigma
        w_start = alpha_bar_prev - sigma * alpha_bar
        w_uniform = 1 - w_current - w_start

        # Stack the three weighted components on a new last axis, then mix
        # them in log space.
        log_components = torch.stack([
            torch.log(w_current) + log_x_t,
            torch.log(w_start) + log_x0,
            torch.log(w_uniform) - torch.log(self.num_classes_expanded)
        ], dim=2)
        log_mixture = torch.logsumexp(log_components, dim=2)

        return self.log_sample_categorical(log_mixture)
    

    @torch.no_grad()
    def sample_ddim(self, num_samples, steps=1000):
        """Generate samples with a strided reverse process.

        Numerical columns and the continuous encoding of the categorical
        columns are both advanced with ``gaussian_ddim_step`` (called with its
        default ``eta``). After the last step the categorical columns are
        decoded with ``psk2cat``.

        Args:
            num_samples: number of rows to generate.
            steps: requested number of sampling steps. The stride is
                ``num_timesteps // steps``, never less than 1, so asking for
                more steps than there are timesteps visits each timestep once.

        Returns:
            CPU tensor holding the numerical columns followed by the decoded
            categorical columns (numerical columns only when there are no
            categorical bits).

        Raises:
            ValueError: if ``steps`` is smaller than 1.
        """
        if steps < 1:
            raise ValueError(f'steps must be a positive integer, got {steps}')

        b = num_samples
        device = self.log_alpha.device
        n_num = self.num_numerical_features
        n_cat_bits = sum(self.num_bits_per_cat_feature)

        # Start from standard normal noise over every continuous column.
        z_norm = torch.randn((b, n_num + n_cat_bits), device=device)

        # Descending schedule that always ends at timestep 0. The stride is
        # clamped to 1: a stride of 0 (steps > num_timesteps) is invalid.
        interval = max(1, self.num_timesteps // steps)
        timesteps = list(range(self.num_timesteps - 1, -1, -interval))
        if timesteps[-1] != 0:
            timesteps.append(0)

        with tqdm(total=len(timesteps), desc="DDIM Sampling", unit="step") as pbar:
            for i, step in enumerate(timesteps):
                # The last entry has no successor; it steps to 0.
                next_step = timesteps[i + 1] if i + 1 < len(timesteps) else 0
                t = torch.full((b,), step, device=device, dtype=torch.long)
                t_prev = torch.full((b,), next_step, device=device, dtype=torch.long)

                # Denoise the complete input
                model_out = self._denoise_fn(z_norm.float(), t)

                z_num = self.gaussian_ddim_step(
                    model_out[:, :n_num],
                    z_norm[:, :n_num],
                    t,
                    t_prev
                )

                if n_cat_bits > 0:
                    # Categorical bits are treated as continuous values.
                    z_cat = self.gaussian_ddim_step(
                        model_out[:, n_num:],
                        z_norm[:, n_num:],
                        t,
                        t_prev
                    )
                    z_norm = torch.cat([z_num, z_cat], dim=1)
                else:
                    z_norm = z_num

                # Advance the bar; without this it stays at 0 for the whole run.
                pbar.update(1)

        # Final processing
        z_num = z_norm[:, :n_num]
        if n_cat_bits > 0:
            z_cat = psk2cat(z_norm[:, n_num:], self.num_classes)
            return torch.cat([z_num, z_cat], dim=1).cpu()
        return z_num.cpu()





    # @torch.no_grad()
    # def sample_ddim(self, num_samples, steps = 1000):
    #     # b = num_samples
    #     # device = self.log_alpha.device
    #     # z_norm = torch.randn((b, self.num_numerical_features), device=device)

    #     b = num_samples
    #     device = self.log_alpha.device
    #     # Use the same representation as sample()
    #     z_norm = torch.randn((b, self.num_numerical_features + sum(self.num_bits_per_cat_feature)), 
    #                         device=device)

    #     has_cat = self.num_classes[0] != 0
    #     log_z = torch.zeros((b, 0), device=device).float()
    #     if has_cat:
    #         uniform_logits = torch.zeros((b, len(self.num_classes_expanded)), device=device)
    #         log_z = self.log_sample_categorical(uniform_logits)
        
    #     interval = 1000 // steps
    #     timesteps = list(np.arange(999, -1, -interval))

    #     if timesteps[-1] != 0:
    #         timesteps.append(0)
        
    #     for i in range(0, len(timesteps)):

    #         print(f'Sample timestep {i:4d}', end='\r')
            
    #         t = torch.full((b,), timesteps[i], device=device, dtype=torch.long)
            
           
    #         if i != len(timesteps) -1 :
    #             t_prev = torch.full((b,), timesteps[i+1], device=device, dtype=torch.long)
    #         else:
    #             t_prev = torch.full((b,), 0, device=device, dtype=torch.long)
                
    #         model_out = self._denoise_fn(
    #             torch.cat([z_norm, log_z], dim=1).float(),
    #             t
    #         )
    #         model_out_num = model_out[:, :self.num_numerical_features]
    #         model_out_cat = model_out[:, self.num_numerical_features:]
    #         z_norm = self.gaussian_ddim_step(model_out_num, z_norm, t, t_prev, clip_denoised=False)
    #         if has_cat:
    #             log_z = self.multinomial_ddim_step(model_out_cat, log_z, t, t_prev)

    #     print()
    #     z_ohe = torch.exp(log_z).round()
    #     z_cat = log_z
    #     if has_cat:
    #         z_cat = ohe_to_categories(z_ohe, self.num_classes)
    #     sample = torch.cat([z_norm, z_cat], dim=1).cpu()
    #     return sample


    @torch.no_grad()
    def sample(self, num_samples):
        """Generate samples by running ``gaussian_p_sample`` over every timestep.

        Starts from float32 standard normal noise over the numerical columns
        plus the categorical bit columns, steps from ``num_timesteps - 1`` down
        to 0, then decodes the categorical columns with ``psk2cat``.

        Returns:
            CPU tensor holding the numerical columns followed by the decoded
            categorical columns.
        """
        batch = num_samples
        device = self.log_alpha.device
        n_num = self.num_numerical_features
        width = n_num + sum(self.num_bits_per_cat_feature)

        z = torch.randn((batch, width), dtype=torch.float32, device=device)

        for step in range(self.num_timesteps - 1, -1, -1):
            print(f'Sample timestep {step:4d}', end='\r')
            t = torch.full((batch,), step, device=device, dtype=torch.long)
            model_out = self._denoise_fn(z, t)
            z = self.gaussian_p_sample(model_out, z, t, clip_denoised=False)['sample']

        decoded_cat = psk2cat(z[:, n_num:], self.num_classes)
        return torch.cat([z[:, :n_num], decoded_cat], dim=1).cpu()
    
    def sample_all(self, num_samples, batch_size, ddim=False, steps = 1000):
        """Generate at least ``num_samples`` rows in batches and return exactly that many.

        Args:
            num_samples: number of rows to return.
            batch_size: number of rows requested from the sampler per call.
            ddim: use ``sample_ddim`` (with ``steps``) instead of ``sample``.
            steps: forwarded to ``sample_ddim`` when ``ddim`` is set.

        Raises:
            FoundNANsError: if any generated batch contains a row with NaNs.
        """
        if ddim:
            print('Sample using DDIM.')

        chunks = []
        produced = 0
        while produced < num_samples:
            if ddim:
                batch = self.sample_ddim(batch_size, steps=steps)
            else:
                batch = self.sample(batch_size)

            # Drop rows containing NaNs; a shortened batch is then reported below.
            has_nan = torch.any(batch.isnan(), dim=1)
            batch = batch[~has_nan]
            chunks.append(batch)

            if batch.shape[0] != batch_size:
                raise FoundNANsError
            produced += batch.shape[0]

        return torch.cat(chunks, dim=0)[:num_samples]