lipdp.pipeline module

DatasetMetadata dataclass

Dataclass that holds the dataset metadata used to compute privacy guarantees.

Source code in lipdp/pipeline.py
@dataclass
class DatasetMetadata:
    """
    class that handle dataset metadata that will be used
    to compute privacy guarantees
    """

    input_shape: Tuple[int, int, int]
    nb_classes: int
    nb_samples_train: int
    nb_samples_test: int
    class_names: List[str]
    nb_steps_per_epochs: int
    batch_size: int
    max_norm: float
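
For illustration only, a DatasetMetadata instance can be built by hand. The values below are hypothetical and describe an MNIST-like setup (28x28x1 inputs, 10 classes); in practice the object is returned by load_and_prepare_data below.

metadata = DatasetMetadata(
    input_shape=(28, 28, 1),
    nb_classes=10,
    nb_samples_train=60_000,
    nb_samples_test=10_000,
    class_names=[str(i) for i in range(10)],
    nb_steps_per_epochs=60_000 // 256,
    batch_size=256,
    max_norm=28.0,  # hypothetical input norm bound, e.g. sqrt(28*28*1)
)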

load_and_prepare_data(dataset_name='mnist', batch_size=256, colorspace='RGB', drop_remainder=True, augmentation_fct=None, bound_fct=None)

Load the dataset dataset_name using TensorFlow Datasets.

Parameters:

dataset_name (str, default 'mnist'): name of the dataset to load.
batch_size (int, default 256): batch size.
colorspace (str, default 'RGB'): one of RGB, HSV, YIQ, YUV.
drop_remainder (bool, default True): when True, drop the last batch if it has fewer than batch_size elements.
augmentation_fct (callable, default None): data augmentation applied to the training set. The function must take a tuple (img, label) and return a tuple (img, label).
bound_fct (callable, default None): function responsible for bounding the inputs. Can be None, bound_normalize or bound_clip_value. None means that no clipping is performed and the maximum theoretical value, sqrt(w*h*c), is reported. bound_normalize normalizes each input, setting the bound to 1. bound_clip_value clips the norm to the given value.

Returns:

ds_train, ds_test, metadata: the train and test datasets, with data preparation, augmentation, shuffling and batching applied, plus a DatasetMetadata object with information about the dataset.
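
A minimal call sketch, using the defaults and no input bounding (the bound_normalize and bound_clip_value helpers mentioned above can be passed instead); the final line only illustrates the returned metadata fields.

ds_train, ds_test, metadata = load_and_prepare_data(
    dataset_name="mnist",
    batch_size=256,
    colorspace="RGB",
    drop_remainder=True,
    augmentation_fct=None,  # no data augmentation
    bound_fct=None,         # no clipping, theoretical bound is reported
)
print(metadata.nb_classes, metadata.batch_size, metadata.max_norm)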

Source code in lipdp/pipeline.py
def load_and_prepare_data(
    dataset_name: str = "mnist",
    batch_size: int = 256,
    colorspace: str = "RGB",
    drop_remainder=True,
    augmentation_fct=None,
    bound_fct=None,
):
    """
    load dataset_name data using tensorflow datasets.

    Args:
        dataset_name (str): name of the dataset to load.
        batch_size (int): batch size
        colorspace (str): one of RGB, HSV, YIQ, YUV
        drop_remainder (bool, optional): when true drop the last batch if it
            has less than batch_size elements. Defaults to True.
        augmentation_fct (callable, optional): data augmentation to be applied
            to train. the function must take a tuple (img, label) and return a
            tuple of (img, label). Defaults to None.
        bound_fct (callable, optional): function that is responsible of
            bounding the inputs. Can be None, bound_normalize or bound_clip_value.
            None means that no clipping is performed, and max theoretical value is
            reported ( sqrt(w*h*c) ). bound_normalize means that each input is
            normalized setting the bound to 1. bound_clip_value will clip norm to
            defined value.

    Returns:
        ds_train, ds_test, metadat: two dataset, with data preparation,
            augmentation, shuffling and batching. Also return an
            DatasetMetadata object with infos about the dataset.
    """
    # load data
    (ds_train, ds_test), ds_info = tfds.load(
        dataset_name,
        split=["train", "test"],
        shuffle_files=True,
        as_supervised=True,
        with_info=True,
    )
    # handle case where functions are None
    if augmentation_fct is None:
        augmentation_fct = lambda x, y: (x, y)
    # None bound yield default trivial bound
    nb_classes = ds_info.features["label"].num_classes
    input_shape = ds_info.features["image"].shape
    if bound_fct is None:
        bound_fct = (
            lambda x, y: (x, y),
            input_shape[-3] * input_shape[-2] * input_shape[-1],
        )
    bound_callable, bound_val = bound_fct
    # train pipeline
    ds_train = (
        ds_train.map(  # map to 0,1 and one hot encode
            lambda x, y: (
                tf.cast(x, tf.float32) / 255.0,
                tf.one_hot(y, nb_classes),
            ),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        .shuffle(  # shuffle
            min(batch_size * 10, max(batch_size, ds_train.cardinality())),
            reshuffle_each_iteration=True,
        )
        .map(augmentation_fct, num_parallel_calls=tf.data.AUTOTUNE)  # augment
        .map(  # map colorspace
            get_colorspace_function(colorspace),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        .map(bound_callable, num_parallel_calls=tf.data.AUTOTUNE)  # apply bound
        .batch(batch_size, drop_remainder=drop_remainder)  # batch
        .prefetch(tf.data.AUTOTUNE)
    )

    ds_test = (
        ds_test.map(
            lambda x, y: (
                tf.cast(x, tf.float32) / 255.0,
                tf.one_hot(y, nb_classes),
            ),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        .map(
            get_colorspace_function(colorspace),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
        .shuffle(
            min(batch_size * 10, max(batch_size, ds_test.cardinality())),
            reshuffle_each_iteration=True,
        )
        .batch(batch_size, drop_remainder=drop_remainder)
        .prefetch(tf.data.AUTOTUNE)
    )
    # get dataset metadata
    metadata = DatasetMetadata(
        input_shape=ds_info.features["image"].shape,
        nb_classes=ds_info.features["label"].num_classes,
        nb_samples_train=ds_info.splits["train"].num_examples,
        nb_samples_test=ds_info.splits["test"].num_examples,
        class_names=ds_info.features["label"].names,
        nb_steps_per_epochs=ds_train.cardinality().numpy()
        if ds_train.cardinality() > 0  # handle case cardinality return -1 (unknown)
        else ds_info.splits["train"].num_examples / batch_size,
        batch_size=batch_size,
        max_norm=bound_val,
    )

    return ds_train, ds_test, metadata
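
Note that although the docstring describes bound_fct as a callable, the source above unpacks it as a (callable, bound_value) pair. As a sketch under that assumption, a hand-rolled bound that clips each image to an L2 norm of 1.0 (used here in place of the provided bound_normalize / bound_clip_value helpers) could look like this:

import tensorflow as tf

# Hypothetical custom bound: the callable is applied to each (img, label)
# pair before batching, and the second element is the norm bound recorded
# in DatasetMetadata.max_norm.
custom_bound = (
    lambda x, y: (tf.clip_by_norm(x, 1.0), y),
    1.0,
)

ds_train, ds_test, metadata = load_and_prepare_data(
    dataset_name="mnist",
    bound_fct=custom_bound,
)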