Module data_preprocess.fl_datasets.cifar10_by_class
Expand source code
from torchvision.datasets import CIFAR10
from PIL import Image
import numpy as np
import torch
class FLCifar10ByClass(CIFAR10):
    """
    CIFAR10 dataset whose samples are partitioned across clients by class label.

    The sample indices of every class are shuffled and divided between the clients assigned to that class.
    With the current settings (10 clients, p = 0.10) every class is assigned to exactly one client.
    A current client can be selected with set_client(); indexing and len() then refer to that client's part.

    Attributes set by the constructor:
        num_clients (int): number of clients (fixed to 10)
        datapoints_per_client (dict): client id -> list of sample indices owned by that client
        classes_to_client_id (dict): class label -> list of client ids that store samples of that class
        total_samples (int): total number of samples in the underlying dataset
    """

    def __init__(self, exec_ctx, args, root, train=True, transform=None, target_transform=None, download=False,
                 client_id=None):
        """
        Args:
            exec_ctx: execution context; its np_random.shuffle is used for every random permutation
            args: accepted for interface compatibility, currently not used by this class
            root, train, transform, target_transform, download: forwarded to the CIFAR10 constructor
            client_id(int): client to select after construction; None selects the whole dataset
        """
        super(FLCifar10ByClass, self).__init__(root, train=train, transform=transform,
                                               target_transform=target_transform, download=download)
        self.num_clients = 10

        # Total number of datapoints in dataset
        num_datapoints = len(self.targets)

        # Key - class, Value - list of point indicies (keys appear in first-seen order)
        class_to_points_indicies = {}
        for i, label in enumerate(self.targets):
            class_to_points_indicies.setdefault(label, []).append(i)

        classes = list(class_to_points_indicies.keys())
        assert len(classes) == 10

        # Spread each class across p(10%) clients
        p = 0.10
        client_id_to_classes = classes * int(self.num_clients * p)
        exec_ctx.np_random.shuffle(client_id_to_classes)

        # Position i of the shuffled list is served by client (i % num_clients)
        classes_to_client_id = {c: [] for c in classes}
        for i, c in enumerate(client_id_to_classes):
            classes_to_client_id[c].append(i % self.num_clients)

        # Reshuffle point indices inside each class
        for c in classes:
            exec_ctx.np_random.shuffle(class_to_points_indicies[c])

        # Finally split data across clients
        self.datapoints_per_client = {client: [] for client in range(self.num_clients)}
        for c, clients in classes_to_client_id.items():
            points = class_to_points_indicies[c]
            num_clients_for_class = len(clients)
            # NOTE: integer division - if the class size is not divisible by the number of clients,
            # the trailing points of the class are not assigned to any client.
            num_points_per_client = len(points) // num_clients_for_class
            for k in range(num_clients_for_class):
                start = k * num_points_per_client
                self.datapoints_per_client[clients[k]] += points[start:start + num_points_per_client]

        # Reshuffle data points of every client
        for client in range(self.num_clients):
            exec_ctx.np_random.shuffle(self.datapoints_per_client[client])

        # Mapping between classes and client ids
        self.classes_to_client_id = classes_to_client_id

        # Total number of samples
        self.total_samples = num_datapoints

        # Sanity check: the stored class -> clients mapping must agree with a direct scan of the partition.
        # The check is deterministic, so it is executed once; sets are compared because element order of
        # list(set(...)) is not guaranteed to match between the two methods.
        make_test = True
        if make_test:
            cc1 = self.get_clients_that_stores_class_naively(2)
            cc2 = self.get_clients_that_stores_class(2)
            assert set(cc1) == set(cc2)

        self.set_client(client_id)

    def get_clients_that_stores_class(self, k):
        """ Get ids of clients that store class k, based on the mapping built in the constructor.
        Args:
            k: class label
        Returns:
            list: distinct client ids (order is not specified)
        """
        return list(set(self.classes_to_client_id[k]))

    def get_clients_that_stores_class_naively(self, k):
        """ Get ids of clients that store class k by scanning the targets and the per-client index lists.
        Used as an independent cross-check of get_clients_that_stores_class().
        Args:
            k: class label
        Returns:
            list: distinct client ids (order is not specified)
        """
        # Build membership sets once; testing "i in list" for every sample is quadratic
        client_points = [set(self.datapoints_per_client[client]) for client in range(self.num_clients)]

        clients = []
        for i, c in enumerate(self.targets):
            if c != k:
                continue
            # Record the first (lowest id) client that owns sample i, if any
            for client, owned in enumerate(client_points):
                if i in owned:
                    clients.append(client)
                    break
        return list(set(clients))

    def set_client(self, index=None):
        """ Set current client.
        Args:
            index(int): index of current client. If index is None the partitioned dataset is considered as
                        one single dataset
        Returns:
            int: Number of points available for the current client (the whole dataset if index is None)
        Raises:
            ValueError: if index is not None and is outside [0, num_clients)
        """
        if index is None:
            self.client_id = None
            self.length = len(self.data)
        else:
            if index < 0 or index >= self.num_clients:
                raise ValueError('Number of clients is out of bounds.')
            self.client_id = index
            self.length = len(self.datapoints_per_client[index])
        return self.length

    def __getitem__(self, index):
        """
        Args:
            index (int): Index of the item fetched on behalf of the currently selected client
        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.client_id is None:
            actual_index = index
        else:
            # Translate the client-local index into an index of the underlying dataset
            actual_index = self.datapoints_per_client[self.client_id][index]

        img, target = self.data[actual_index], self.targets[actual_index]

        # doing this so that it is consistent with all other fl_datasets to return a PIL Image
        if isinstance(img, np.ndarray):
            img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        # TODO: If __getitem__ will always fetch object from the CPU memory. Suggestion use GPU memory or another GPU as a cache storage
        return img, target

    def __len__(self):
        """ Get length of dataset for the current client
        Returns:
            int: Number of points for the current client
        """
        return self.length
Classes
class FLCifar10ByClass (exec_ctx, args, root, train=True, transform=None, target_transform=None, download=False, client_id=None)
-
CIFAR10 dataset partitioned by class: the samples of each class are shuffled and allocated to randomly chosen clients out of num_clients clients.
Expand source code
class FLCifar10ByClass(CIFAR10):
    """
    CIFAR10 dataset whose samples are partitioned across clients by class label.

    The sample indices of every class are shuffled and divided between the clients assigned to that class.
    With the current settings (10 clients, p = 0.10) every class is assigned to exactly one client.
    A current client can be selected with set_client(); indexing and len() then refer to that client's part.

    Attributes set by the constructor:
        num_clients (int): number of clients (fixed to 10)
        datapoints_per_client (dict): client id -> list of sample indices owned by that client
        classes_to_client_id (dict): class label -> list of client ids that store samples of that class
        total_samples (int): total number of samples in the underlying dataset
    """

    def __init__(self, exec_ctx, args, root, train=True, transform=None, target_transform=None, download=False,
                 client_id=None):
        """
        Args:
            exec_ctx: execution context; its np_random.shuffle is used for every random permutation
            args: accepted for interface compatibility, currently not used by this class
            root, train, transform, target_transform, download: forwarded to the CIFAR10 constructor
            client_id(int): client to select after construction; None selects the whole dataset
        """
        super(FLCifar10ByClass, self).__init__(root, train=train, transform=transform,
                                               target_transform=target_transform, download=download)
        self.num_clients = 10

        # Total number of datapoints in dataset
        num_datapoints = len(self.targets)

        # Key - class, Value - list of point indicies (keys appear in first-seen order)
        class_to_points_indicies = {}
        for i, label in enumerate(self.targets):
            class_to_points_indicies.setdefault(label, []).append(i)

        classes = list(class_to_points_indicies.keys())
        assert len(classes) == 10

        # Spread each class across p(10%) clients
        p = 0.10
        client_id_to_classes = classes * int(self.num_clients * p)
        exec_ctx.np_random.shuffle(client_id_to_classes)

        # Position i of the shuffled list is served by client (i % num_clients)
        classes_to_client_id = {c: [] for c in classes}
        for i, c in enumerate(client_id_to_classes):
            classes_to_client_id[c].append(i % self.num_clients)

        # Reshuffle point indices inside each class
        for c in classes:
            exec_ctx.np_random.shuffle(class_to_points_indicies[c])

        # Finally split data across clients
        self.datapoints_per_client = {client: [] for client in range(self.num_clients)}
        for c, clients in classes_to_client_id.items():
            points = class_to_points_indicies[c]
            num_clients_for_class = len(clients)
            # NOTE: integer division - if the class size is not divisible by the number of clients,
            # the trailing points of the class are not assigned to any client.
            num_points_per_client = len(points) // num_clients_for_class
            for k in range(num_clients_for_class):
                start = k * num_points_per_client
                self.datapoints_per_client[clients[k]] += points[start:start + num_points_per_client]

        # Reshuffle data points of every client
        for client in range(self.num_clients):
            exec_ctx.np_random.shuffle(self.datapoints_per_client[client])

        # Mapping between classes and client ids
        self.classes_to_client_id = classes_to_client_id

        # Total number of samples
        self.total_samples = num_datapoints

        # Sanity check: the stored class -> clients mapping must agree with a direct scan of the partition.
        # The check is deterministic, so it is executed once; sets are compared because element order of
        # list(set(...)) is not guaranteed to match between the two methods.
        make_test = True
        if make_test:
            cc1 = self.get_clients_that_stores_class_naively(2)
            cc2 = self.get_clients_that_stores_class(2)
            assert set(cc1) == set(cc2)

        self.set_client(client_id)

    def get_clients_that_stores_class(self, k):
        """ Get ids of clients that store class k, based on the mapping built in the constructor.
        Args:
            k: class label
        Returns:
            list: distinct client ids (order is not specified)
        """
        return list(set(self.classes_to_client_id[k]))

    def get_clients_that_stores_class_naively(self, k):
        """ Get ids of clients that store class k by scanning the targets and the per-client index lists.
        Used as an independent cross-check of get_clients_that_stores_class().
        Args:
            k: class label
        Returns:
            list: distinct client ids (order is not specified)
        """
        # Build membership sets once; testing "i in list" for every sample is quadratic
        client_points = [set(self.datapoints_per_client[client]) for client in range(self.num_clients)]

        clients = []
        for i, c in enumerate(self.targets):
            if c != k:
                continue
            # Record the first (lowest id) client that owns sample i, if any
            for client, owned in enumerate(client_points):
                if i in owned:
                    clients.append(client)
                    break
        return list(set(clients))

    def set_client(self, index=None):
        """ Set current client.
        Args:
            index(int): index of current client. If index is None the partitioned dataset is considered as
                        one single dataset
        Returns:
            int: Number of points available for the current client (the whole dataset if index is None)
        Raises:
            ValueError: if index is not None and is outside [0, num_clients)
        """
        if index is None:
            self.client_id = None
            self.length = len(self.data)
        else:
            if index < 0 or index >= self.num_clients:
                raise ValueError('Number of clients is out of bounds.')
            self.client_id = index
            self.length = len(self.datapoints_per_client[index])
        return self.length

    def __getitem__(self, index):
        """
        Args:
            index (int): Index of the item fetched on behalf of the currently selected client
        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        if self.client_id is None:
            actual_index = index
        else:
            # Translate the client-local index into an index of the underlying dataset
            actual_index = self.datapoints_per_client[self.client_id][index]

        img, target = self.data[actual_index], self.targets[actual_index]

        # doing this so that it is consistent with all other fl_datasets to return a PIL Image
        if isinstance(img, np.ndarray):
            img = Image.fromarray(img)

        if self.transform is not None:
            img = self.transform(img)

        if self.target_transform is not None:
            target = self.target_transform(target)

        # TODO: If __getitem__ will always fetch object from the CPU memory. Suggestion use GPU memory or another GPU as a cache storage
        return img, target

    def __len__(self):
        """ Get length of dataset for the current client
        Returns:
            int: Number of points for the current client
        """
        return self.length
Ancestors
- torchvision.datasets.cifar.CIFAR10
- torchvision.datasets.vision.VisionDataset
- torch.utils.data.dataset.Dataset
- typing.Generic
Methods
def get_clients_that_stores_class(self, k)
-
Expand source code
def get_clients_that_stores_class(self, k):
    """ Get ids of clients that store class k, using the stored class -> clients mapping.
    Args:
        k: class label (key of self.classes_to_client_id)
    Returns:
        list: distinct client ids (order is not specified)
    """
    assigned_clients = self.classes_to_client_id[k]
    # Collapse repeated ids: a client may appear several times for the same class
    unique_clients = set(assigned_clients)
    return list(unique_clients)
def get_clients_that_stores_class_naively(self, k)
-
Expand source code
def get_clients_that_stores_class_naively(self, k):
    """ Get ids of clients that store class k by scanning the targets and the per-client index lists.
    Args:
        k: class label
    Returns:
        list: distinct client ids (order is not specified); empty if no client owns a sample of class k
    """
    # Build membership sets once; testing "i in list" for every sample is quadratic
    client_points = [set(self.datapoints_per_client[client]) for client in range(self.num_clients)]

    clients = []
    for i, c in enumerate(self.targets):
        if c != k:
            continue
        # Record the first (lowest id) client that owns sample i, if any
        for client, owned in enumerate(client_points):
            if i in owned:
                clients.append(client)
                break
    return list(set(clients))
def set_client(self, index=None)
-
Set current client.
Args
index(int): index of current client. If index is None the partitioned dataset is considered as one single dataset
Returns
int
- Number of training points for the current client
Expand source code
def set_client(self, index=None):
    """ Set current client.
    Args:
        index(int): index of current client. If index is None the partitioned dataset is considered as
                    one single dataset
    Returns:
        int: Number of points available for the current client (the whole dataset if index is None)
    Raises:
        ValueError: if index is not None and is outside [0, num_clients)
    """
    if index is None:
        self.client_id = None
        self.length = len(self.data)
    else:
        if index < 0 or index >= self.num_clients:
            raise ValueError('Number of clients is out of bounds.')
        self.client_id = index
        self.length = len(self.datapoints_per_client[index])
    # The docstring always promised the length as a return value; previously nothing was returned
    return self.length