import os
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split


def _split_evenly(covariate, target, nb_parts):
    """Cut aligned arrays into ``nb_parts`` consecutive chunks of equal size.

    The chunk size is ``len(target) // nb_parts``; rows left over after the
    last chunk are dropped. Returns ``(covariate_chunks, target_chunks)``.
    """
    size = target.shape[0] // nb_parts
    covariate_chunks = [
        covariate[i * size:(i + 1) * size, ...] for i in range(nb_parts)
    ]
    target_chunks = [target[i * size:(i + 1) * size] for i in range(nb_parts)]
    return covariate_chunks, target_chunks


def data_binary_mnist(nb_honest, heterogeneity=0.999, random_state=None):
    """Build per-worker even/odd MNIST training batches plus a test set.

    The dataset is fetched with ``fetch_openml('mnist_784')`` on first use
    and cached as ``.npy`` files under ``./data``. Labels are mapped to
    ``-1`` (even digit) and ``1`` (odd digit). 20% of the samples are held
    out as a test set (fixed seed 42).

    A fraction ``heterogeneity`` of the training set is partitioned by
    parity: the first ``nb_honest // 2`` workers receive only even samples
    and the remaining workers only odd samples. The other
    ``1 - heterogeneity`` fraction is split evenly between all workers
    regardless of label and appended to each worker's batch.

    Args:
        nb_honest: Number of workers to create batches for (at least 2, so
            that each parity class has at least one worker).
        heterogeneity: Fraction in ``[0, 1]`` of the training data that is
            distributed by parity.
        random_state: Forwarded to the heterogeneous/homogeneous
            ``train_test_split``. The default ``None`` keeps that split
            unseeded.

    Returns:
        A dict with keys ``"nb_honest"``, ``"covariate_train_batches"``,
        ``"target_train_batches"`` (lists with one array per worker),
        ``"target_test"`` and ``"covariate_test"``.

    Raises:
        ValueError: If ``nb_honest < 2`` or ``heterogeneity`` is outside
            ``[0, 1]``.
    """
    if nb_honest < 2:
        raise ValueError(
            f"nb_honest must be at least 2 (one worker per parity class), "
            f"got {nb_honest}"
        )
    if not 0 <= heterogeneity <= 1:
        raise ValueError(
            f"heterogeneity must lie in [0, 1], got {heterogeneity}"
        )

    # Directory to save the dataset
    data_dir = './data'
    os.makedirs(data_dir, exist_ok=True)

    # File paths. Both must end in '.npy': np.save appends that suffix to
    # any other name, so the existence check below would never match the
    # file actually written and the dataset would be downloaded every call.
    covariate_data_path = os.path.join(data_dir, 'mnist_covariate.npy')
    target_data_path = os.path.join(data_dir, 'mnist_target.npy')

    if os.path.exists(covariate_data_path) and os.path.exists(target_data_path):
        covariate = np.load(covariate_data_path)
        target = np.load(target_data_path)
    else:
        mnist = fetch_openml('mnist_784', version=1, as_frame=False)
        covariate, target = mnist["data"], mnist["target"].astype(np.int8)

        np.save(covariate_data_path, covariate)
        np.save(target_data_path, target)

    # Preprocessing: prepend a constant column, then rescale.
    # NOTE(review): the division applies to the whole stacked matrix, so the
    # constant column holds 1/255 rather than 1 - confirm this is intended.
    covariate = np.hstack([np.ones((covariate.shape[0], 1)), covariate]) / 255.

    # Split the data into training and test sets
    covariate_train, covariate_test, target_train_source, target_test_source = (
        train_test_split(covariate, target, test_size=0.2, random_state=42))

    # Convert labels to -1 for even and 1 for odd
    target_train = np.where(target_train_source % 2 == 0, -1, 1)
    target_test = np.where(target_test_source % 2 == 0, -1, 1)

    # Heterogeneity preprocessing: "ordered" samples are distributed by
    # parity, "unordered" samples are shared between all workers.
    homogeneous_data_proportion = 1 - heterogeneity

    if homogeneous_data_proportion <= 0:
        # Fully heterogeneous: train_test_split rejects an empty test side,
        # so build the empty homogeneous pool by hand.
        covariate_train_ordered, target_train_ordered = covariate_train, target_train
        covariate_train_unordered, target_train_unordered = (
            covariate_train[:0], target_train[:0])
    elif homogeneous_data_proportion >= 1:
        # Fully homogeneous: nothing is distributed by parity. The rows were
        # already shuffled by the train/test split above.
        covariate_train_ordered, target_train_ordered = (
            covariate_train[:0], target_train[:0])
        covariate_train_unordered, target_train_unordered = (
            covariate_train, target_train)
    else:
        (covariate_train_ordered, covariate_train_unordered,
         target_train_ordered, target_train_unordered) = train_test_split(
            covariate_train, target_train,
            test_size=homogeneous_data_proportion,
            random_state=random_state)

    mask_even = (target_train_ordered == -1)
    mask_odd = (target_train_ordered == 1)

    nb_even = nb_honest // 2
    nb_odd = nb_honest - nb_even

    # Even-only batches come first, odd-only batches after them.
    covariate_even_batches, target_even_batches = _split_evenly(
        covariate_train_ordered[mask_even, :], target_train_ordered[mask_even],
        nb_even)
    covariate_odd_batches, target_odd_batches = _split_evenly(
        covariate_train_ordered[mask_odd, :], target_train_ordered[mask_odd],
        nb_odd)

    # We add a bit of homogeneous data. The pool is divided by the number
    # of workers so that every worker receives an equally sized share.
    covariate_shared_batches, target_shared_batches = _split_evenly(
        covariate_train_unordered, target_train_unordered, nb_honest)

    covariate_train_batches = [
        np.vstack([own, shared])
        for own, shared in zip(covariate_even_batches + covariate_odd_batches,
                               covariate_shared_batches)
    ]
    target_train_batches = [
        np.concatenate((own, shared))
        for own, shared in zip(target_even_batches + target_odd_batches,
                               target_shared_batches)
    ]

    nb_honest = len(target_train_batches)

    return {
        "nb_honest": nb_honest,
        "covariate_train_batches": covariate_train_batches,
        "target_train_batches": target_train_batches,
        "target_test": target_test,
        "covariate_test": covariate_test,
    }
