import os
import cv2
import pywt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms

import logging
logger = logging.getLogger(__name__)

from metrics.base_metrics_class import calculate_metrics_for_train
from detectors import DETECTOR
from networks import BACKBONE
from loss import LOSSFUNC

from .base_detector import AbstractDetector

# -------------
# Wavelet utility
# -------------
def wavelet_denoise(image: np.ndarray,
                    wavelet='db1',
                    level=3,
                    threshold_fraction=1.0) -> np.ndarray:
    """
    Perform wavelet 'denoising' by zeroing out some detail coefficients.

    The image is decomposed with ``pywt.wavedec2``; the three detail
    sub-bands of every level are thresholded in place; the result is rebuilt
    with ``pywt.waverec2`` and cropped to the spatial size of ``image``:
      - threshold_fraction <= 0.0: Keep ALL detail (no denoise).
      - threshold_fraction >= 1.0: Remove ALL detail.
      - 0 < threshold_fraction < 1: In each sub-band, zero every coefficient
        whose magnitude is strictly below the value found at position
        ``int(n * threshold_fraction)`` of that sub-band's sorted magnitudes
        (roughly the smallest fraction of them).

    Args:
        image: Array whose last two axes are the spatial dimensions.
        wavelet: Wavelet passed through to PyWavelets.
        level: Number of decomposition levels passed to ``pywt.wavedec2``.
        threshold_fraction: How much detail to discard (see above).

    Returns:
        The reconstructed array; its last two axes have the same size as
        those of ``image``.
    """
    # 1) Decompose: coeffs[0] is the low-frequency approximation, every
    #    following entry is one level's detail sub-bands.
    coeffs = pywt.wavedec2(image, wavelet, level=level)

    # 2) Threshold details (in place; the arrays belong to this call only)
    keep_all = threshold_fraction <= 0.0
    drop_all = threshold_fraction >= 1.0
    if not keep_all:
        for detail_level in coeffs[1:]:
            for band in detail_level:
                if drop_all:
                    band[...] = 0
                    continue
                if band.size == 0:
                    # Nothing to threshold; indexing below would fail.
                    continue
                magnitude = np.abs(band)
                kth = int(magnitude.size * threshold_fraction)
                kth = max(0, min(kth, magnitude.size - 1))
                # Only one order statistic is needed, so a partial sort
                # yields the same cutoff as a full sort.
                cutoff_value = np.partition(magnitude.ravel(), kth)[kth]
                band[magnitude < cutoff_value] = 0

    # 3) Reconstruct. The inverse transform can return one extra row/column
    #    when a spatial size was odd at some level, so crop back to the
    #    input's spatial size to keep the output aligned with the input.
    reconstructed = pywt.waverec2(coeffs, wavelet)
    rows, cols = np.shape(image)[-2:]
    return reconstructed[..., :rows, :cols]

class WaveletDenoiseExtractor(nn.Module):
    """
    Converts an RGB image into a single-channel grayscale image and applies
    wavelet partial denoising. Output scaled to [-1,1].

    The wavelet step runs on the CPU through NumPy, so no gradient flows
    through this module and it has no learnable parameters.
    """
    def __init__(self, wavelet='db1', level=3, threshold_fraction=1.0):
        """
        Args:
            wavelet: Wavelet forwarded to ``wavelet_denoise``.
            level: Decomposition depth forwarded to ``wavelet_denoise``.
            threshold_fraction: Fraction of detail to remove, forwarded to
                ``wavelet_denoise``.
        """
        super().__init__()
        self.wavelet = wavelet
        self.level = level
        self.threshold_fraction = threshold_fraction

    def forward(self, image_tensor: torch.Tensor) -> torch.Tensor:
        """
        image_tensor: [B,3,H,W], expected to be in range [-1,1].
        Returns: wavelet-denoised grayscale [B,1,H,W] in range [-1,1],
        as float32 on the same device as ``image_tensor``.
        """
        # 1) Convert (B,3,H,W) to (B,1,H,W) by averaging the channels
        gray_tensor = torch.mean(image_tensor, dim=1, keepdim=True)

        # 2) Move to CPU NumPy. detach() is required because numpy() refuses
        #    tensors that are part of an autograd graph.
        gray_np = gray_tensor.detach().cpu().numpy()  # shape (B,1,H,W)
        height, width = gray_np.shape[-2:]

        denoised_list = []
        for gray_2d in gray_np[:, 0]:
            # Convert from [-1,1] -> [0..255], quantised to 8 bits and then
            # handed to the wavelet transform as float32.
            gray_255 = ((gray_2d + 1) * 127.5).clip(0, 255).astype(np.uint8)
            gray_float = gray_255.astype(np.float32)

            # PARTIAL zeroing of detail in wavelet domain
            denoised_2d = wavelet_denoise(
                gray_float,
                wavelet=self.wavelet,
                level=self.level,
                threshold_fraction=self.threshold_fraction
            )
            # The wavelet reconstruction may be one row/column larger than
            # its input for odd sizes; crop so the result stays (H,W).
            denoised_2d = np.clip(denoised_2d[:height, :width], 0, 255)
            denoised_list.append(denoised_2d)

        if not denoised_list:
            # Empty batch: np.stack cannot stack zero arrays.
            return torch.zeros(
                (0, 1, height, width),
                dtype=torch.float32,
                device=image_tensor.device
            )

        # 3) Stack => (B,H,W) => (B,1,H,W)
        denoised_array = np.stack(denoised_list, axis=0).astype(np.float32)
        wavelet_tensor = torch.from_numpy(denoised_array).unsqueeze(1)

        # 4) Scale back to [-1,1]
        wavelet_tensor = wavelet_tensor / 127.5 - 1.0
        return wavelet_tensor.to(image_tensor.device)

@DETECTOR.register_module(module_name='wavelet_phase_efficient')
class WaveletPhase_efficient_Detector(AbstractDetector):
    """
    Detector that uses:
      - 3-channel RGB
      - A fused channel of wavelet-denoised grayscale + phase-only grayscale
      => total of 4 channels for EfficientNet-B4 backbone.

    Config keys read by this class: 'backbone_name', 'backbone_config',
    'pretrained', 'loss_func' and, optionally, 'fusion_pretrained'.
    """
    def __init__(self, config):
        """
        Build the backbone, wavelet extractor, loss and fusion block.

        Args:
            config: Mapping holding the keys listed in the class docstring.
        """
        super().__init__()
        self.config = config

        # Build the 4-channel EfficientNet-B4 backbone
        self.backbone = self.build_backbone(config)

        # Wavelet extractor
        self.wavelet_extractor = WaveletDenoiseExtractor(
            wavelet='db1',
            level=3,
            threshold_fraction=1.0
        )

        # Loss function
        self.loss_func = self.build_loss(config)

        # Fusion block: merges wavelet + phase => 1 channel
        # (Input shape: [B,2,H,W], output shape: [B,1,H,W])
        self.fusion_block = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=1, bias=False),
            nn.BatchNorm2d(1),
            nn.ReLU(inplace=True)
        )

        # Set to True once the fusion block has been frozen; consulted by
        # train() so the block's BatchNorm stays in eval mode.
        self._fusion_frozen = False

        # Optionally load & freeze the fusion block weights from config
        fusion_pretrained = config.get('fusion_pretrained', None)
        self.load_and_freeze_fusion_block_weights(fusion_pretrained)

        # Recorders for inference
        self.prob = []
        self.label = []
        self.correct = 0
        self.total = 0

    def train(self, mode: bool = True):
        """
        Switch train/eval mode, keeping a frozen fusion block in eval mode.

        Setting ``requires_grad=False`` does not stop BatchNorm from updating
        its running statistics while in training mode, so a frozen fusion
        block is put back into eval mode after every mode switch.

        NOTE(review): relies on the base class providing ``train`` (i.e.
        being an ``nn.Module``) - confirm against ``AbstractDetector``.
        """
        super().train(mode)
        if getattr(self, '_fusion_frozen', False):
            self.fusion_block.eval()
        return self

    def build_backbone(self, config):
        """
        Build an EfficientNet-B4 backbone that can handle 4 input channels:
        3-ch RGB + 1-ch fused wavelet+phase.

        Reads ``config['backbone_name']``, ``config['backbone_config']`` and
        ``config['pretrained']``. Returns the backbone with its
        ``efficientnet._conv_stem`` replaced by a 4-input-channel conv.
        """
        # Look up the backbone class in the registry and instantiate it.
        backbone_class = BACKBONE[config['backbone_name']]
        model_config = config['backbone_config']
        backbone = backbone_class(model_config)

        # Load the pretrained state_dict for the backbone.
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        state_dict = torch.load(config['pretrained'], map_location='cpu')

        # If your pretrained checkpoint includes 2D pointwise conv weights
        # stored as [out_ch, in_ch] with no spatial dims, reshape them:
        for name, weights in list(state_dict.items()):
            if 'pointwise' in name and weights.ndim == 2:
                state_dict[name] = weights.unsqueeze(-1).unsqueeze(-1)

        # Remove final classifier if present
        state_dict = {
            k: v for k, v in state_dict.items()
            if 'fc' not in k and 'classifier' not in k
        }

        # Pull the stem weights out of the checkpoint. The stem lives at
        # `backbone.efficientnet._conv_stem`, so a checkpoint saved from this
        # wrapper stores it under a prefixed key; accept both the bare key
        # and any prefixed form. Shortest key first keeps the bare key as the
        # preferred source.
        conv_stem_key = '_conv_stem.weight'
        stem_keys = sorted(
            (k for k in state_dict
             if k == conv_stem_key or k.endswith('.' + conv_stem_key)),
            key=len
        )
        conv_stem_data = None
        for key in stem_keys:
            stem_weights = state_dict.pop(key)
            if conv_stem_data is None:
                conv_stem_data = stem_weights

        # Load partial weights
        missing, unexpected = backbone.load_state_dict(state_dict, strict=False)
        logger.info(
            f"Loaded pretrained model from {config['pretrained']}. "
            f"Missing: {missing}, Unexpected: {unexpected}"
        )

        # Replace the default conv stem to handle 4 channels instead of 3
        # Typically, B4 has 48 out_channels in the stem
        backbone.efficientnet._conv_stem = nn.Conv2d(
            in_channels=4,
            out_channels=48,
            kernel_size=3,
            stride=2,
            padding=0,
            bias=False
        )

        # If we actually found pretrained stem weights in the old checkpoint,
        # initialise every input channel with their mean over input channels.
        if conv_stem_data is not None:
            with torch.no_grad():
                # conv_stem_data shape: [48,3,3,3] for B4
                avg_data = conv_stem_data.mean(dim=1, keepdim=True)  # becomes [48,1,3,3]
                new_data = avg_data.repeat(1, 4, 1, 1)               # becomes [48,4,3,3]
                backbone.efficientnet._conv_stem.weight.data = new_data

        logger.info("Modified _conv_stem to accept 4 input channels (RGB + fused wavelet+phase).")
        return backbone

    def load_and_freeze_fusion_block_weights(self, fusion_pretrained_path):
        """
        Loads only the fusion_block weights from a checkpoint
        and freezes them.

        Args:
            fusion_pretrained_path: Checkpoint path, or None to skip loading
                (the fusion block then stays trainable).
        """
        if fusion_pretrained_path is None:
            logger.info("No 'fusion_pretrained' specified; skipping fusion_block loading.")
            return

        logger.info(f"Loading fusion_block weights from: {fusion_pretrained_path}")
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(fusion_pretrained_path, map_location='cpu')

        # Some checkpoints have a "state_dict" key, others are direct
        state_dict = checkpoint.get('state_dict', checkpoint)

        # Extract only keys that belong to the fusion block.
        prefix = 'fusion_block.'
        wrapper_prefix = 'module.'  # added when a model is saved while wrapped in (Distributed)DataParallel
        fusion_state_dict = {}
        for k, v in state_dict.items():
            name = k[len(wrapper_prefix):] if k.startswith(wrapper_prefix) else k
            if name.startswith(prefix):
                # e.g. "fusion_block.0.weight" -> "0.weight"; strip only the
                # leading prefix, never later occurrences.
                fusion_state_dict[name[len(prefix):]] = v

        if not fusion_state_dict:
            logger.warning(
                f"No 'fusion_block.*' entries found in {fusion_pretrained_path}; "
                "the fusion block keeps its random initialisation and is frozen."
            )

        missing, unexpected = self.fusion_block.load_state_dict(fusion_state_dict, strict=False)
        logger.info(f"Loaded fusion_block. Missing: {missing}, Unexpected: {unexpected}")
        if fusion_state_dict and missing:
            logger.warning(f"fusion_block entries not found in checkpoint: {missing}")

        # Freeze the fusion_block so it's not updated during training
        for param in self.fusion_block.parameters():
            param.requires_grad = False

        # BatchNorm running statistics are updated in training mode even
        # without gradients, so the block must also stay in eval mode.
        self._fusion_frozen = True
        self.fusion_block.eval()

    def build_loss(self, config):
        """
        Build the loss function registered under ``config['loss_func']``.
        """
        loss_class = LOSSFUNC[config['loss_func']]
        return loss_class()

    def phase_without_amplitude(self, img: torch.Tensor) -> torch.Tensor:
        """
        Extract the phase-only image:
        1) Convert to grayscale => (B,1,H,W)
        2) FFT to obtain frequency representation
        3) Extract phase => angle
        4) Rebuild frequency representation with magnitude=1 => e^{j * phase}
        5) iFFT => purely phase-based image

        Returns the real part of the inverse transform, shape (B,1,H,W).
        """
        # Convert to grayscale by averaging the channel axis
        gray_img = torch.mean(img, dim=1, keepdim=True)  # (B,1,H,W)

        # FFT over the two spatial axes
        X = torch.fft.fftn(gray_img, dim=(-2, -1))

        # Extract phase
        phase_spectrum = torch.angle(X)

        # Magnitude = 1 => e^{j*phase}
        reconstructed_X = torch.exp(1j * phase_spectrum)

        # iFFT => purely phase-based
        reconstructed_x = torch.fft.ifftn(reconstructed_X, dim=(-2, -1))
        phase_image = torch.real(reconstructed_x)
        return phase_image

    def features(self, data_dict: dict) -> torch.Tensor:
        """
        Steps:
         1) Original RGB: (B,3,H,W)
         2) Wavelet-denoised grayscale: (B,1,H,W)
         3) Phase-only grayscale: (B,1,H,W)
         4) Fuse wavelet + phase => (B,1,H,W)
         5) Concatenate with RGB => (B,4,H,W)
         6) Pass to backbone.features()

        Reads ``data_dict['image']`` and returns the backbone features.
        """
        # Follow the device the model's weights actually live on instead of
        # guessing from global CUDA availability; this keeps CPU models and
        # models on a non-default GPU working.
        device = self.fusion_block[0].weight.device
        x_rgb = data_dict['image'].to(device)  # (B,3,H,W)

        # (B,1,H,W) wavelet
        x_wavelet = self.wavelet_extractor(x_rgb)

        # (B,1,H,W) phase
        x_phase = self.phase_without_amplitude(x_rgb)

        # Fuse wavelet + phase => single channel
        wavelet_phase = torch.cat([x_wavelet, x_phase], dim=1)  # (B,2,H,W)
        fused_channel = self.fusion_block(wavelet_phase)        # (B,1,H,W)

        # Combine fused channel with RGB => 4 channels
        x_4ch = torch.cat([x_rgb, fused_channel], dim=1)        # (B,4,H,W)

        # Extract features from the backbone
        feats = self.backbone.features(x_4ch)
        return feats

    def classifier(self, features: torch.Tensor) -> torch.Tensor:
        """
        Pass features through the backbone's classifier head.
        """
        return self.backbone.classifier(features)

    def get_losses(self, data_dict: dict, pred_dict: dict) -> dict:
        """
        Compute the loss between ``pred_dict['cls']`` and
        ``data_dict['label']`` and return it as ``{'overall': loss}``.
        """
        label = data_dict['label']
        pred = pred_dict['cls']
        loss = self.loss_func(pred, label)
        return {'overall': loss}

    def get_train_metrics(self, data_dict: dict, pred_dict: dict) -> dict:
        """
        Compute batch metrics for training.

        Returns a dict with keys 'acc', 'auc', 'eer' and 'ap'.
        """
        label = data_dict['label']
        pred = pred_dict['cls']
        auc, eer, acc, ap = calculate_metrics_for_train(
            label.detach(),
            pred.detach()
        )
        return {'acc': acc, 'auc': auc, 'eer': eer, 'ap': ap}

    def forward(self, data_dict: dict, inference=False) -> dict:
        """
        Forward pass:
         1) Extract wavelet+phase fused features.
         2) Classify => logits.
         3) Compute probabilities and (optionally) record for inference.

        Returns a dict with 'cls' (logits), 'prob' (softmax probability of
        class index 1) and 'feat' (backbone features). With
        ``inference=True`` the probabilities, labels and running
        correct/total counts are also accumulated on the instance.
        """
        feats = self.features(data_dict)
        pred = self.classifier(feats)
        prob = torch.softmax(pred, dim=1)[:, 1]

        pred_dict = {
            'cls': pred,   # logits
            'prob': prob,  # scalar prob for the "positive" class
            'feat': feats
        }

        if inference:
            # Collect probabilities and labels
            self.prob.append(prob.detach().cpu().numpy())
            self.label.append(data_dict['label'].detach().cpu().numpy())

            # Track accuracy
            _, prediction_class = torch.max(pred, 1)
            correct = (prediction_class == data_dict['label']).sum().item()
            self.correct += correct
            self.total += data_dict['label'].size(0)

        return pred_dict
