Module data_preprocess.artificial_dataset

#!/usr/bin/env python3

from .read_file_cache import cacheItemThreadUnsafe, cacheMakeKey, cacheGetItem, cacheHasItem
from .fl_dataset import FLDataset
import numpy as np

# Import PyTorch root package
import torch
import math


class ArificialDataset(FLDataset):
    """
    FL dataset class that generates a synthetic linear regression dataset with a controlled smoothness spectrum.
    """
    def __init__(self, exec_ctx, args, train=None, client_id=None, transform=None, target_transform=None):
        """
        The constructor for a synthetic dataset.

        Args:
            exec_ctx: execution context whose random number generator is used for data generation
            args: command-line arguments carrying the generation specification
            train (bool): True if we are in training mode (currently unused)
            client_id (int): restrict the view of the dataset to the data of client client_id
            transform: transformation applied to the input attributes before they are fed into the loss computation
            target_transform: transformation applied to the response variable before the loss computation
        """
        genSpecList = args.dataset_generation_spec.split(",")
        genSpec = {}
        for item in genSpecList:
            k, v = item.split(':')
            genSpec[k] = v

        self.transform = transform
        self.target_transform = target_transform

        self.num_clients = int(genSpec['clients'])
        self.n_client_samples = int(genSpec['samples_per_client'])

        d = int(genSpec['variables'])

        rows = self.num_clients * self.n_client_samples
        cols = d

        # NOTE: the train flag is currently unused; the same synthetic data is
        # generated for both training and evaluation.

        xSolutionMultiplier = 10.0
        xSolution = np.ones(d) * xSolutionMultiplier
        b_perturbation = 0.0

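        # homogeneous == 1: every client holds an identical copy of the same local
        # data block (Ai, Bi); otherwise one global random matrix is generated and
        # split evenly across clients.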
        if int(genSpec['homogeneous']) == 1:
            # NumPy array filled with random numbers drawn uniformly from [0, 1)
            Ai = exec_ctx.np_random.rand(rows // self.num_clients, cols)
            U, _, Vt = np.linalg.svd(Ai, full_matrices=True)
            L = float(genSpec['l'])
            mu = float(genSpec['mu'])
            S = np.zeros((U.shape[1], Vt.shape[0]))
            len_s = min(S.shape[0], S.shape[1])
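            # Linearly interpolate the squared singular values between mu and L so
            # that the eigenvalues of Ai^T Ai span exactly [mu, L].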
            if len_s > 1:
                for i in range(len_s):
                    S[i][i] = math.sqrt((L - mu) * float(i) / (len_s - 1) + mu)
            else:
                S[0][0] = math.sqrt(L)

            Ai = U @ S @ Vt
            Bi = Ai @ xSolution
            Bi = Bi.reshape(-1, 1)
            Bi += b_perturbation * exec_ctx.np_random.rand(*Bi.shape)  # random perturbation of the targets

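            # Scale to realize the target L and mu for the client objective
            # (see the derivation in the heterogeneous branch below).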
            Ai *= math.sqrt(Ai.shape[0]/2.0)
            Bi *= math.sqrt(Ai.shape[0]/2.0)

            A = []
            B = []
            for c in range(self.num_clients):
                A.append(Ai)
                B.append(Bi)
            A = np.vstack(A)
            B = np.vstack(B)
        else:
            A = exec_ctx.np_random.rand(rows, cols)
            U, _, Vt = np.linalg.svd(A, full_matrices=True)
            L = float(genSpec['l'])
            mu = float(genSpec['mu'])

            S = np.zeros((U.shape[1], Vt.shape[0]))
            len_s = min(S.shape[0], S.shape[1])

            if len_s > 1:
                for i in range(len_s):
                    S[i][i] = math.sqrt((L - mu) * float(i)/(len_s - 1) + mu)
            else:
                S[0][0] = math.sqrt(L)

            A = U @ S @ Vt
            B = A @ xSolution
            B = B.reshape(-1, 1)
            B += b_perturbation * exec_ctx.np_random.rand(*B.shape)  # random perturbation of the targets

            # Extra scaling to obtain exactly L and mu for the objective
            # f(x) = (1/n) * sum_i (a_i @ x - b_i)**2
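            # Why sqrt(n/2): the Hessian of f is (2/n) * A^T A. The eigenvalues of
            # A^T A lie in [mu, L] by construction, so scaling A by sqrt(n/2) scales
            # A^T A by n/2 and the Hessian eigenvalues land exactly in [mu, L].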
            A *= math.sqrt(A.shape[0]/2.0)
            B *= math.sqrt(A.shape[0]/2.0)

        self.data = torch.Tensor(A)
        self.targets = torch.Tensor(B)

        # ==============================================================================================================
        # Move data to GPU maybe
        # self.store_in_target_device = args.store_data_in_target_device
        # Move data to target device
        # if self.store_in_target_device:
        #    self.targets = self.targets.to(device = args.device)
        #    self.data = self.data.to(device = args.device)
        # ==============================================================================================================

        self.set_client(client_id)

    def compute_Li_for_linear_regression(self):
        """
        Compute the smoothness constant L of the full linear regression objective
        and the per-client smoothness constants Li.
        """
        A = self.data
        # L = (2/n) * ||A||_2^2, where ||A||_2 is the spectral norm (largest singular value)
        self.L = ((2 / A.shape[0]) * torch.linalg.norm(A, 2) ** 2).item()
        self.Li_all_clients = []

        for c in range(self.num_clients):
            subdata = self.data[c * self.n_client_samples:(c + 1) * self.n_client_samples, ...]
            Li = ((2 / subdata.shape[0]) * torch.linalg.norm(subdata, 2) ** 2).item()
            self.Li_all_clients.append(Li)

        # Sanity checks, up to a small numerical tolerance: L <= max_i Li <= L * num_clients
        assert max(self.Li_all_clients) + 1.0e-3 >= self.L
        assert max(self.Li_all_clients) - 1.0e-3 <= self.L * self.num_clients

    def set_client(self, index=None):
        """
        Set the pointer to the data of the client with the given index.
        If index is None, the complete dataset, as the union of all clients'
        data points, is observable by the higher level.

        Args:
            index (int): index of the client.

        Returns:
            None
        """
        if index is None:
            self.client_id = None
            self.length = len(self.data)
        else:
            if index < 0 or index >= self.num_clients:
                raise ValueError('Client index is out of bounds.')
            self.client_id = index
            self.length = self.n_client_samples

    def load_data(self):
        """
        Explicitly load all needed data from the filesystem or cache for this dataset instance.
        For this synthetic dataset the data is generated in the constructor, so this is a no-op.
        """
        pass

    def __getitem__(self, index):
        """
        Args:
            index (int): Index

        Returns:
            tuple: (sample, target) where sample is a row of the design matrix
            and target is the corresponding regression response.
        """
        if self.client_id is None:
            actual_index = index
        else:
            actual_index = int(self.client_id) * self.n_client_samples + index
        img, target = self.data[actual_index], self.targets[actual_index]

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        # TODO: __getitem__ always fetches the objects from CPU memory.
        # Suggestion: use GPU memory (possibly of another GPU) as a cache storage.
        # The returned tensors are detached views that reference the dataset's storage.
        return img.detach(), target.detach()

    def __len__(self):
        return self.length

Classes

class ArificialDataset (exec_ctx, args, train=None, client_id=None, transform=None, target_transform=None)

FL dataset class that generates a synthetic linear regression dataset with a controlled smoothness spectrum.

The constructor for a synthetic dataset.

Args

exec_ctx
execution context whose random number generator is used for data generation
args
command-line arguments carrying the generation specification
train : bool
True if we are in training mode (currently unused)
client_id : int
restrict the view of the dataset to the data of client client_id
transform
transformation applied to the input attributes before they are fed into the loss computation
target_transform
transformation applied to the response variable before the loss computation
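
A minimal usage sketch. The SimpleNamespace stand-ins for args and exec_ctx below are illustrative assumptions: args only needs a dataset_generation_spec string with the keys the constructor parses, and exec_ctx only needs an np_random generator exposing NumPy's RandomState interface.

from types import SimpleNamespace
import numpy as np
from data_preprocess.artificial_dataset import ArificialDataset

# Hypothetical stand-ins; the real framework supplies its own args and exec_ctx.
args = SimpleNamespace(
    dataset_generation_spec='clients:10,samples_per_client:50,variables:20,homogeneous:0,l:10.0,mu:1.0'
)
exec_ctx = SimpleNamespace(np_random=np.random.RandomState(42))

dataset = ArificialDataset(exec_ctx, args, train=True, client_id=None)
print(len(dataset))    # 10 * 50 = 500 samples in the full view
dataset.set_client(3)  # observe only the 50 samples of client 3
x, y = dataset[0]      # first sample of client 3, returned as detached tensors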

Ancestors

  • data_preprocess.fl_dataset.FLDataset
  • torch.utils.data.dataset.Dataset
  • typing.Generic

Methods

def compute_Li_for_linear_regression(self)
Compute the smoothness constant L of the full linear regression objective and the per-client smoothness constants Li; the results are stored in self.L and self.Li_all_clients.
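
Continuing the hypothetical usage sketch above, a quick check of the relationship L <= max_i Li <= num_clients * L that the method asserts:

dataset.compute_Li_for_linear_regression()
print(dataset.L)                    # smoothness constant of the full objective
print(max(dataset.Li_all_clients))  # largest per-client smoothness constant
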
def load_data(self)

Explicitly load all needed data from the filesystem or cache for this dataset instance. For this synthetic dataset the data is generated in the constructor, so this is a no-op.

def set_client(self, index=None)

Set the pointer to the data of the client with the given index. If index is None, the complete dataset, as the union of all clients' data points, is observable by the higher level.

Args

index (int): index of the client.

Returns

None

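
A sketch of per-client iteration, continuing the hypothetical dataset from the usage example above. Since the class inherits from torch.utils.data.Dataset, each client view works directly with a standard DataLoader:

from torch.utils.data import DataLoader

for c in range(dataset.num_clients):
    dataset.set_client(c)                      # narrow the view to client c
    loader = DataLoader(dataset, batch_size=16)
    for x_batch, y_batch in loader:
        pass                                   # client-local training step goes here
dataset.set_client(None)                       # restore the full-dataset view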