import os
import pickle
import random
import tarfile
import urllib.request

import numpy as np
import pandas as pd
import torch
import torchvision
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, TensorDataset
from PIL import Image
import io

# np.random.seed(0)
# random.seed(0)
# torch.random.manual_seed(0)
# torch.manual_seed(0)
# torch.cuda.manual_seed_all(0)


def train_val_split(labels, n_labeled, positive_label_list):
    '''
    Draw the labeled-positive and unlabeled index sets for PU training.

    Every sample index goes into the unlabeled pool, including the ones that
    are also drawn as labeled positives. For each class listed in
    positive_label_list, the first int(n_labeled / len(positive_label_list))
    indices of that class (after a per-class shuffle) are additionally put in
    the labeled pool.

    :param labels: class label of every sample
    :param n_labeled: total number of labeled positives requested
    :param positive_label_list: class labels treated as positive
    :return: (labeled indices, unlabeled indices, prior); both index
             collections are shuffled lists and prior is the fraction of
             samples whose class is in positive_label_list
    '''
    label_array = np.array(labels)
    classes = np.unique(label_array)
    quota = int(n_labeled / len(positive_label_list))

    total = len(label_array)
    positive_total = 0
    labeled_pool = []
    unlabeled_pool = []

    for cls in classes:
        members = np.where(label_array == cls)[0]
        np.random.shuffle(members)

        # The unlabeled pool receives the whole class, positives included.
        unlabeled_pool.extend(members)
        if cls not in positive_label_list:
            continue
        labeled_pool.extend(members[:quota])
        positive_total += len(members)

    np.random.shuffle(labeled_pool)
    np.random.shuffle(unlabeled_pool)

    return labeled_pool, unlabeled_pool, positive_total / total


def normalise_fashionmnist(x, mean, std):
    '''
    Standardise x as (x - mean) / std on a float32 copy.

    :param x: array-like data; the input itself is not modified
    :param mean: value(s) subtracted from x (broadcast against x)
    :param std: value(s) x is divided by (broadcast against x)
    :return: a new float32 numpy array
    '''
    data = np.array(x, np.float32)
    offset = np.array(mean, np.float32)
    scale = np.array(std, np.float32)
    # In-place updates keep the result in the float32 copy made above.
    data -= offset
    data /= scale
    return data


def _3D_to_4(x):
    '''
    :param x: For mnist, it is a tensor of shape (len, 28, 28)
    :return: a tensor of shape (len, 1, 28, 28)
    '''
    return x.reshape(x.shape[0], 1, x.shape[1], x.shape[2])


def normalise(x, mean, std):
    '''
    Standardise x with mean/std that are expressed on a 1/255 scale.

    Computes (x - mean * 255) * (1 / (255 * std)) on a float32 copy of x.

    :param x: array-like data; the input itself is not modified
    :param mean: value(s) that are multiplied by 255 and subtracted from x
    :param std: value(s) that are multiplied by 255 and divide x
    :return: a new float32 numpy array
    '''
    data = np.array(x, np.float32)
    offset = np.array(mean, np.float32)
    scale = np.array(std, np.float32)
    data -= offset * 255
    data *= 1.0 / (255 * scale)
    return data


def transpose(x, source='NHWC', target='NCHW'):
    '''
    Reorder the axes of x from the layout named by source to target.

    Each layout is a string with one letter per axis:
    N: batch size
    H: height
    W: width
    C: channel

    :param x: array whose axes are currently ordered as in source
    :param source: current axis layout
    :param target: desired axis layout (same letters as source)
    :return: x with its axes permuted into the target layout
    '''
    # For every axis of the target, find where it sits in the source.
    axes = list(map(source.index, target))
    return x.transpose(axes)


class FashionMNIST_labeled(torchvision.datasets.FashionMNIST):
    '''
    FashionMNIST restricted to a set of indices and normalised at construction.

    After the torchvision constructor has run, the data (and targets) are
    optionally sub-selected by indexs, then the data is standardised with
    normalise_fashionmnist and reshaped by _3D_to_4 to carry a channel axis.

    :param root: dataset directory handed to torchvision
    :param indexs: indices to keep, or None to keep every sample
    :param train: forwarded to torchvision
    :param transform: optional callable applied to each image on access
    :param target_transform: optional callable applied to each target on access
    :param download: forwarded to torchvision
    '''
    def __init__(self,
                 root,
                 indexs=None,
                 train=True,
                 transform=None,
                 target_transform=None,
                 download=True):
        super().__init__(root,
                         train=train,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)
        if indexs is not None:
            # Targets become a numpy array only when a subset is requested.
            self.data = self.data[indexs]
            self.targets = np.array(self.targets)[indexs]
        # NOTE(review): mean/std look like statistics of pixels scaled to
        # [0, 1], but nothing divides the data by 255 here - confirm the value
        # range of self.data before relying on the normalised scale.
        standardised = normalise_fashionmnist(self.data,
                                              mean=(0.2860, ),
                                              std=(0.3530, ))
        self.data = _3D_to_4(standardised)

    def __getitem__(self, index):
        '''
        :param index: position of the sample
        :return: (img, target) after the optional transforms
        '''
        img = self.data[index]
        target = self.targets[index]

        transform = self.transform
        if transform is not None:
            img = transform(img)

        target_transform = self.target_transform
        if target_transform is not None:
            target = target_transform(target)

        return img, target


class FashionMNIST_unlabeled(FashionMNIST_labeled):
    '''
    Unlabeled counterpart of FashionMNIST_labeled.

    Behaves exactly like the parent class except that indexs is required.
    The original targets are kept; they are not overwritten with -1.
    '''
    def __init__(self,
                 root,
                 indexs,
                 train=True,
                 transform=None,
                 target_transform=None,
                 download=True):
        super().__init__(root,
                         indexs,
                         train=train,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)


def get_fashionMNIST_data(num_labeled,
                          positive_label_list,
                          root,
                          transform_train=None,
                          transform_val=None):
    '''
    Build the FashionMNIST datasets used for PU training.

    :param num_labeled: number of labeled positives passed to train_val_split
    :param positive_label_list: class labels mapped to 1 (all others to -1)
    :param root: dataset directory
    :param transform_train: transform for the labeled, unlabeled and val sets
    :param transform_val: transform for the test set
    :return: (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
             test_dataset, prior); val_dataset is the complete train split
    '''
    def target_transform(x):
        # Binary PU target: +1 for a positive class, -1 otherwise.
        return 1 if x in positive_label_list else -1

    base_dataset = torchvision.datasets.FashionMNIST(root,
                                                     train=True,
                                                     download=True)
    labeled_idxs, unlabeled_idxs, prior = train_val_split(
        base_dataset.targets, num_labeled, positive_label_list)

    train_kwargs = dict(train=True,
                        transform=transform_train,
                        target_transform=target_transform)

    train_labeled_dataset = FashionMNIST_labeled(root, labeled_idxs,
                                                 **train_kwargs)
    train_unlabeled_dataset = FashionMNIST_unlabeled(root, unlabeled_idxs,
                                                     **train_kwargs)
    val_dataset = FashionMNIST_labeled(root, **train_kwargs)
    test_dataset = FashionMNIST_labeled(root,
                                        train=False,
                                        transform=transform_val,
                                        download=True,
                                        target_transform=target_transform)

    return (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
            test_dataset, prior)


class CIFAR10_labeled(torchvision.datasets.CIFAR10):
    '''
    CIFAR10 restricted to a set of indices and normalised at construction.

    After the torchvision constructor has run, the data (and targets) are
    optionally sub-selected by indexs, then the data is passed through
    normalise (mean = std = 0.5 per channel) and transpose with its default
    layouts (NHWC to NCHW).

    :param root: dataset directory handed to torchvision
    :param indexs: indices to keep, or None to keep every sample
    :param train: forwarded to torchvision
    :param transform: optional callable applied to each image on access
    :param target_transform: optional callable applied to each target on access
    :param download: forwarded to torchvision
    '''
    def __init__(self,
                 root,
                 indexs=None,
                 train=True,
                 transform=None,
                 target_transform=None,
                 download=False):
        super().__init__(root,
                         train=train,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)
        if indexs is not None:
            # Targets become a numpy array only when a subset is requested.
            self.data = self.data[indexs]
            self.targets = np.array(self.targets)[indexs]
        standardised = normalise(self.data,
                                 mean=(0.5, 0.5, 0.5),
                                 std=(0.5, 0.5, 0.5))
        self.data = transpose(standardised)

    def __getitem__(self, index):
        '''
        :param index: position of the sample
        :return: (img, target) after the optional transforms
        '''
        img = self.data[index]
        target = self.targets[index]

        transform = self.transform
        if transform is not None:
            img = transform(img)

        target_transform = self.target_transform
        if target_transform is not None:
            target = target_transform(target)

        return img, target


class CIFAR10_unlabeled(CIFAR10_labeled):
    '''
    Unlabeled counterpart of CIFAR10_labeled.

    Behaves exactly like the parent class except that indexs is required.
    The original targets are kept; they are not overwritten with -1.
    '''
    def __init__(self,
                 root,
                 indexs,
                 train=True,
                 transform=None,
                 target_transform=None,
                 download=False):
        super().__init__(root,
                         indexs,
                         train=train,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)


def get_cifar10_data(num_labeled,
                     positive_label_list,
                     root,
                     transform_train=None,
                     transform_val=None):
    '''
    Build the CIFAR10 datasets used for PU training.

    :param num_labeled: number of labeled positives passed to train_val_split
    :param positive_label_list: class labels mapped to 1 (all others to -1)
    :param root: dataset directory
    :param transform_train: transform for the labeled, unlabeled and val sets
    :param transform_val: transform for the test set
    :return: (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
             test_dataset, prior); val_dataset is the complete train split
    '''
    def target_transform(x):
        # Binary PU target: +1 for a positive class, -1 otherwise.
        return 1 if x in positive_label_list else -1

    base_dataset = torchvision.datasets.CIFAR10(root,
                                                train=True,
                                                download=True)
    labeled_idxs, unlabeled_idxs, prior = train_val_split(
        base_dataset.targets, num_labeled, positive_label_list)

    train_kwargs = dict(train=True,
                        transform=transform_train,
                        target_transform=target_transform)

    train_labeled_dataset = CIFAR10_labeled(root, labeled_idxs,
                                            **train_kwargs)
    train_unlabeled_dataset = CIFAR10_unlabeled(root, unlabeled_idxs,
                                                **train_kwargs)
    val_dataset = CIFAR10_labeled(root, **train_kwargs)
    test_dataset = CIFAR10_labeled(root,
                                   train=False,
                                   transform=transform_val,
                                   download=True,
                                   target_transform=target_transform)

    return (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
            test_dataset, prior)


class STL10_labeled(torchvision.datasets.STL10):
    '''
    STL10 restricted to a set of indices and normalised at construction.

    After the torchvision constructor has run, the data (and labels) are
    optionally sub-selected by indexs. The data is then transposed from NCHW
    to NHWC, passed through normalise (mean = std = 0.5 per channel) and
    transposed back to NCHW.

    :param root: dataset directory handed to torchvision
    :param indexs: indices to keep, or None to keep every sample
    :param split: forwarded to torchvision
    :param transform: optional callable applied to each image on access
    :param target_transform: optional callable applied to each label on access
    :param download: forwarded to torchvision
    '''
    def __init__(self,
                 root,
                 indexs=None,
                 split='train+unlabeled',
                 transform=None,
                 target_transform=None,
                 download=False):
        super().__init__(root,
                         split=split,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)
        if indexs is not None:
            self.data = self.data[indexs]
            self.labels = np.array(self.labels)[indexs]
        # normalise() is given channel-last data, so go NCHW -> NHWC first
        # and return to NCHW afterwards.
        channel_last = transpose(self.data, source='NCHW', target='NHWC')
        self.data = channel_last
        standardised = normalise(self.data,
                                 mean=(0.5, 0.5, 0.5),
                                 std=(0.5, 0.5, 0.5))
        self.data = transpose(standardised)

    def __getitem__(self, index):
        '''
        :param index: position of the sample
        :return: (img, target) after the optional transforms
        '''
        img = self.data[index]
        target = self.labels[index]

        transform = self.transform
        if transform is not None:
            img = transform(img)

        target_transform = self.target_transform
        if target_transform is not None:
            target = target_transform(target)

        return img, target


class STL10_unlabeled(STL10_labeled):
    '''
    Unlabeled counterpart of STL10_labeled.

    Behaves exactly like the parent class except that indexs is required.
    The original labels are kept; they are not overwritten with -1.
    '''
    def __init__(self,
                 root,
                 indexs,
                 split='train+unlabeled',
                 transform=None,
                 target_transform=None,
                 download=False):
        super().__init__(root,
                         indexs,
                         split=split,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)


def get_stl10_data(num_labeled,
                   positive_label_list,
                   root,
                   transform_train=None,
                   transform_val=None):
    '''
    Build the STL10 datasets used for PU training.

    The labeled and unlabeled sets are drawn from the 'train+unlabeled'
    split, the val set is the 'train' split and the test set the 'test' split.

    :param num_labeled: number of labeled positives passed to train_val_split
    :param positive_label_list: class labels mapped to 1 (all others to -1)
    :param root: dataset directory
    :param transform_train: transform for the labeled, unlabeled and val sets
    :param transform_val: transform for the test set
    :return: (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
             test_dataset, prior)
    '''
    def target_transform(x):
        # Binary PU target: +1 for a positive class, -1 otherwise.
        return 1 if x in positive_label_list else -1

    base_dataset = torchvision.datasets.STL10(root,
                                              split='train+unlabeled',
                                              download=True)
    labeled_idxs, unlabeled_idxs, prior = train_val_split(
        base_dataset.labels, num_labeled, positive_label_list)

    pool_kwargs = dict(split='train+unlabeled',
                       transform=transform_train,
                       target_transform=target_transform)

    train_labeled_dataset = STL10_labeled(root, labeled_idxs, **pool_kwargs)
    train_unlabeled_dataset = STL10_unlabeled(root, unlabeled_idxs,
                                              **pool_kwargs)
    val_dataset = STL10_labeled(root,
                                split='train',
                                transform=transform_train,
                                target_transform=target_transform)
    test_dataset = STL10_labeled(root,
                                 split='test',
                                 transform=transform_val,
                                 download=True,
                                 target_transform=target_transform)

    return (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
            test_dataset, prior)


class Alzheimer_labeled(Dataset):
    '''
    Alzheimer image dataset read from a parquet file and normalised in memory.

    The parquet file is expected under <root>/alzheimer/ and must provide an
    'image' column (mappings with the encoded image under 'bytes') and a
    'label' column. Every image is decoded as 3-channel RGB, the optional
    index subset is applied, and the pixels are scaled to [0, 1],
    standardised per channel and stored as float32 in NCHW layout.

    :param root: data directory ('~' is expanded)
    :param indexs: indices to keep, or None to keep every sample
    :param train: read the train parquet file if True, else the test file
    :param transform: optional callable applied to each image on access
    :param target_transform: optional callable applied to each target on access
    :param download: not used; accepted so the signature matches the other
                     dataset classes in this file
    '''
    def __init__(self,
                 root,
                 indexs=None,
                 train=True,
                 transform=None,
                 target_transform=None,
                 download=False):
        super(Alzheimer_labeled, self).__init__()
        self.root = os.path.expanduser(root)
        self.train = train
        self.transform = transform
        self.target_transform = target_transform

        # Pick the parquet file that matches the requested split.
        if self.train:
            file_name = 'train-00000-of-00001-c08a401c53fe5312.parquet'
        else:
            file_name = 'test-00000-of-00001-44110b9df98c5585.parquet'
        data_file = os.path.join(self.root, 'alzheimer', file_name)

        df = pd.read_parquet(data_file)

        # Decode every stored image into an (H, W, 3) array.
        self.data = np.array(
            [self._decode_rgb(img_dict['bytes'])
             for img_dict in df['image'].values])
        self.targets = df['label'].values

        if indexs is not None:
            self.data = self.data[indexs]
            self.targets = self.targets[indexs]

        if len(self.data) > 0:
            mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
            std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

            # (x / 255 - mean) / std, broadcast over the trailing channel
            # axis; in-place on the float32 copy to avoid extra temporaries.
            self.data = self.data.astype(np.float32)
            self.data /= 255.0
            self.data -= mean
            self.data /= std

            # Convert from NHWC to NCHW format
            self.data = np.transpose(self.data, (0, 3, 1, 2))

    @staticmethod
    def _decode_rgb(img_bytes):
        '''
        Decode encoded image bytes into an (H, W, 3) array.

        convert('RGB') yields three channels for any source mode, so
        grayscale, palette or alpha-channel images all fit the 3-channel
        normalisation; the context manager releases the decoder when done.
        '''
        with Image.open(io.BytesIO(img_bytes)) as img:
            return np.array(img.convert('RGB'))

    def __getitem__(self, index):
        '''
        :param index: position of the sample
        :return: (img, target) after the optional transforms
        '''
        img, target = self.data[index], self.targets[index]

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return img, target

    def __len__(self):
        '''Number of samples held in self.data.'''
        return len(self.data)


class Alzheimer_unlabeled(Alzheimer_labeled):
    '''
    Unlabeled counterpart of Alzheimer_labeled.

    Identical to the parent class except that indexs is required and the
    download default is True; every argument is forwarded unchanged.
    '''
    def __init__(self,
                 root,
                 indexs,
                 train=True,
                 transform=None,
                 target_transform=None,
                 download=True):
        super().__init__(root,
                         indexs,
                         train=train,
                         transform=transform,
                         target_transform=target_transform,
                         download=download)


def get_alzheimer_data(num_labeled,
                       positive_label_list,
                       root,
                       transform_train=None,
                       transform_val=None):
    '''
    Build the Alzheimer datasets used for PU training.

    :param num_labeled: number of labeled positives passed to train_val_split
    :param positive_label_list: class labels mapped to 1 (all others to -1)
    :param root: dataset directory
    :param transform_train: transform for the labeled, unlabeled and val sets
    :param transform_val: transform for the test set
    :return: (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
             test_dataset, prior); val_dataset is the complete train split
    '''
    def target_transform(x):
        # Binary PU target: +1 for a positive class, -1 otherwise.
        return 1 if x in positive_label_list else -1

    # The full train split is loaded once just to read its targets.
    base_dataset = Alzheimer_labeled(root, train=True, download=True)
    labeled_idxs, unlabeled_idxs, prior = train_val_split(
        base_dataset.targets, num_labeled, positive_label_list)

    train_kwargs = dict(train=True,
                        transform=transform_train,
                        target_transform=target_transform)

    train_labeled_dataset = Alzheimer_labeled(root, labeled_idxs,
                                              **train_kwargs)
    train_unlabeled_dataset = Alzheimer_unlabeled(root, unlabeled_idxs,
                                                  **train_kwargs)
    val_dataset = Alzheimer_labeled(root, **train_kwargs)
    test_dataset = Alzheimer_labeled(root,
                                     train=False,
                                     transform=transform_val,
                                     target_transform=target_transform)

    return (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
            test_dataset, prior)


def get_loaders(train_labeled_dataset,
                train_unlabeled_dataset,
                val_dataset,
                test_dataset,
                batch_size=512):
    '''
    Wrap the datasets in DataLoaders.

    The three training loaders shuffle and drop the last incomplete batch;
    the two evaluation loaders keep the dataset order and every sample.
    Note that train_loader and val_loader both read from val_dataset.

    :param train_labeled_dataset: dataset behind p_loader
    :param train_unlabeled_dataset: dataset behind x_loader
    :param val_dataset: dataset behind train_loader and val_loader
    :param test_dataset: dataset behind test_loader
    :param batch_size: batch size shared by all loaders
    :return: (p_loader, x_loader, train_loader, val_loader, test_loader)
    '''
    def shuffled(dataset):
        return DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          drop_last=True)

    def ordered(dataset):
        return DataLoader(dataset=dataset,
                          batch_size=batch_size,
                          shuffle=False)

    p_loader = shuffled(train_labeled_dataset)
    x_loader = shuffled(train_unlabeled_dataset)
    train_loader = shuffled(val_dataset)
    val_loader = ordered(val_dataset)
    test_loader = ordered(test_dataset)

    return p_loader, x_loader, train_loader, val_loader, test_loader


def load_image_dataset(dataset_name,
                       num_labeled,
                       batchsize,
                       positive_label_list,
                       root='../data',
                       with_bias=False,
                       resample_model=""):
    '''
    Build the datasets and data loaders for one of the supported datasets
    and print a short summary.

    :param dataset_name: "cifar10", "fashionmnist", "stl10" or "alzheimer"
    :param num_labeled: number of labeled positives, forwarded to the builder
    :param batchsize: batch size used for every loader
    :param positive_label_list: class labels treated as positive
    :param root: data directory forwarded to the builder
    :param with_bias: not used by this function
    :param resample_model: not used by this function
    :return: (p_loader, x_loader, train_loader, val_loader, test_loader,
             train_labeled_dataset, dim, prior) where dim is the integer
             number of features per sample
    :raises ValueError: if dataset_name is not one of the supported names
    '''
    print("==================")
    print("loading data...")
    if dataset_name == "cifar10":
        build_datasets = get_cifar10_data
    elif dataset_name == "fashionmnist":
        build_datasets = get_fashionMNIST_data
    elif dataset_name == "stl10":
        build_datasets = get_stl10_data
    elif dataset_name == "alzheimer":
        build_datasets = get_alzheimer_data
    else:
        raise ValueError("dataset name {} is unknown.".format(dataset_name))

    (train_labeled_dataset, train_unlabeled_dataset, val_dataset,
     test_dataset, prior) = build_datasets(
         num_labeled=num_labeled,
         positive_label_list=positive_label_list,
         root=root)

    p_loader, x_loader, train_loader, val_loader, test_loader = get_loaders(
        train_labeled_dataset, train_unlabeled_dataset, val_dataset,
        test_dataset, batchsize)

    # Features per sample = product of all non-batch axes. Taking it from the
    # shape gives an int for every dataset (true division used to produce a
    # float such as 3072.0 for all datasets except "alzheimer").
    dim = int(np.prod(train_labeled_dataset.data.shape[1:]))
    print("load data success!")
    print("==================")
    print('    # train data: ', len(x_loader.dataset))
    print('    # labeled train data: ', len(p_loader.dataset))
    print('    # val data: ', len(val_loader.dataset))
    print('    # test data: ', len(test_loader.dataset))
    print('    prior: ', prior)
    print('    dim: ', dim)

    return (p_loader, x_loader, train_loader, val_loader, test_loader,
            train_labeled_dataset, dim, prior)
