如何使用 tf.data.Dataset.map 对两个都无限迭代的 tf.data.Dataset 进行逐元素求和?
How to do the element-wise sum of two tf.data.Datasets, both iterating indefinitely, with tf.data.Dataset.map?
我想在基于 tf.data 的管道中编写一个 mixup 数据增强 [1] 函数。
我创建了一个包含训练样本的 tf.data.Dataset,以及另一个包含用于增强训练样本的样本的 tf.data.Dataset。
我想将 dataset_train 的元素 (feat_train, label_train) 映射为 (feat_train + feat_aug, label_train + label_aug),其中 feat_aug 和 label_aug 是 dataset_aug 的元素,并且两个数据集都无限循环迭代。例如,对于具有 3 个元素的 dataset_train 和具有 2 个元素的 dataset_aug:
feat_train[0], label_train[0] -> feat_train[0] + feat_aug[0], label_train[0] + label_aug[0]
feat_train[1], label_train[1] -> feat_train[1] + feat_aug[1], label_train[1] + label_aug[1]
feat_train[2], label_train[2] -> feat_train[2] + feat_aug[0], label_train[2] + label_aug[0]
feat_train[0], label_train[0] -> feat_train[0] + feat_aug[1], label_train[0] + label_aug[1]
feat_train[1], label_train[1] -> feat_train[1] + feat_aug[0], label_train[1] + label_aug[0]
...
如何在我的 mixup 函数中实现这种行为?是否有其他推荐的方法,可以对两个无限迭代的 tf.data.Dataset 执行逐元素操作?
[1] Hongyi Zhang 等. "mixup: Beyond empirical risk minimization." arXiv 预印本 arXiv:1710.09412 (2017).
# files_train and files_aug are lists of TFRecord files.
# parse TFRecords to get training example features and
# one-hot encoded labels
dataset_train = tf.data.TFRecordDataset(files_train)
dataset_train = dataset_train.map(
lambda x: serialized2data(x, feature_shape, class_list))
dataset_train = dataset_train.shuffle(10000)
dataset_train = dataset_train.repeat() # Repeat indefinitely.
# parse TFRecords to get augmentation example features and
# one-hot encoded labels
dataset_aug = tf.data.TFRecordDataset(files_aug)
dataset_aug = dataset_aug.map(
lambda x: serialized2data(x, feature_shape, class_list))
dataset_aug = dataset_aug.repeat() # Repeat indefinitely.
# augment data (mixup)
# Here how can I write a map function so that the features of every item
# of dataset_train is mixed with an item of dataset_aug ?
# something like
# dataset_train = dataset_train.map(
# lambda feat_train, label_train: mixup(
# feat_train, label_train, feat_aug, label_aug)
# )
# ?
# but how can I iterate dataset_aug to get feat_aug and label_aug ?
# make batch
dataset_train = dataset_train.batch(batch_size, drop_remainder=True)
return dataset
def mixup(feat_train, label_train, feat_aug, label_aug):
    """Mix one training example with one augmentation example.

    Args:
        feat_train: features of the training example.
        label_train: labels of the training example.
        feat_aug: features of the augmentation example.
        label_aug: labels of the augmentation example.

    Returns:
        Tuple (feat_train + feat_aug, label_train + label_aug).
    """
    # Shown as an example. This will be more complicated...
    return (feat_train + feat_aug,
            label_train + label_aug)
def serialized2data(
        serialized_data,
        feature_shape,
        class_list,
        data_format='channels_first',
        training=True):
    """Generate features, labels and, if training is False, filenames and times.

    The label string stored in the record is split on '#', converted to
    integer class ids, restricted to the ids that also appear in class_list
    and encoded as a multi-hot vector with tf.size(class_list) entries.
    Position i of that vector corresponds to the i-th entry of the *sorted*
    class_list. An example with no label left after filtering gets an
    all-zero vector.

    Args:
        serialized_data: data serialized using utils.tf_utils.serialize_data
        feature_shape: shape of the features. Can be obtained with
            feature_extractor.feature_shape (see utils.feature_utils).
            Only its first two entries are used when reshaping the data.
        class_list: list of class ids (used for one-hot encoding the labels)
        data_format: 'channels_first' (NCHW) or 'channels_last' (NHWC).
            Default is set to 'channels_first' because it is faster on GPU
            (https://www.tensorflow.org/guide/performance/overview#data_formats).
            Any value other than 'channels_first' is handled as channels-last.
        training: when True only (data, one_hot) is returned, and examples
            without any label in class_list are routed to myprint.

    Returns:
        (data, one_hot) if training is True, otherwise
        (data, one_hot, filename, times).
    """
    features = {
        'filename': tf.io.FixedLenFeature([], tf.string),
        'times': tf.io.FixedLenFeature([2], tf.float32),
        'data': tf.io.FixedLenFeature(feature_shape, tf.float32),
        'labels': tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_single_example(serialized_data, features)

    # add a singleton channel axis, first or last depending on data_format
    if data_format == 'channels_first':
        data = tf.reshape(
            example['data'], (1, feature_shape[0], feature_shape[1]))
    else:
        data = tf.reshape(
            example['data'], (feature_shape[0], feature_shape[1], 1))

    # parse the '#'-separated label string into integer class ids
    labels = tf.strings.to_number(
        tf.string_split([example['labels']], '#').values,
        out_type=tf.int32
    )

    # get intersection of class_list and labels
    labels = tf.squeeze(
        tf.sparse.to_dense(
            tf.sets.intersection(
                tf.expand_dims(labels, axis=0),
                tf.expand_dims(class_list, axis=0)
            )
        ),
        axis=0
    )

    # sort class_list and get indices of labels in class_list
    class_list = tf.sort(class_list)
    labels = tf.where(
        tf.equal(
            tf.expand_dims(labels, axis=1),
            class_list)
    )[:, 1]  # column 1 holds the position inside the sorted class_list

    # Report training examples that end up without any label.
    # NOTE(review): the result of this tf.cond is discarded and myprint is
    # defined elsewhere -- confirm the message is actually emitted when this
    # runs inside Dataset.map, and that both branches return compatible values.
    tf.cond(
        tf.math.logical_and(training, tf.equal(tf.size(labels), 0)),
        true_fn=lambda: myprint(
            tf.strings.format('File {} has no label', example['filename'])),
        false_fn=lambda: 1
    )

    # multi-hot encode: merge the one-hot rows of all labels, or fall back to
    # an all-zero vector when there is no label
    one_hot = tf.cond(
        tf.equal(tf.size(labels), 0),
        true_fn=lambda: tf.zeros(tf.size(class_list)),
        false_fn=lambda: tf.reduce_max(
            tf.one_hot(labels, tf.size(class_list)), 0)
    )

    if training:
        return (data, one_hot)
    else:
        return (data, one_hot, example['filename'], example['times'])
下面提供一段示例代码,演示如何实现您想要的效果。我分别创建了长度为 3 和 2 的 train_dataset 和 aug_dataset,两者都包含图像和标签,图像的形状为 (64, 64, 3)。train_dataset 的标签是 [10, 20, 30],aug_dataset 的标签是 [1, 2]。
请特别留意输出中的标签,确认它们是否按照您期望的方式循环。
import numpy as np
import tensorflow as tf
# Eager execution is switched on before any dataset is built.
tf.enable_eager_execution()

# Toy training set: 3 random images of shape (64, 64, 3) with labels
# 10, 20, 30, repeated indefinitely.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (np.random.rand(3, 64, 64, 3), np.array([10, 20, 30]))
).repeat()

# Toy augmentation set: 2 random images of shape (64, 64, 3) with labels
# 1, 2, repeated indefinitely.
aug_dataset = tf.data.Dataset.from_tensor_slices(
    (np.random.rand(2, 64, 64, 3), np.arange(1, 3))
).repeat()

# Pair each training element with the augmentation element at the same
# position in the (repeated) streams.
dataset = tf.data.Dataset.zip((train_dataset, aug_dataset))
def add_datasets(dataset1, dataset2):
    """Add two zipped dataset elements component by component.

    Despite the parameter names, each argument is a single element (an
    indexable pair whose item 0 is the image and item 1 is the label), not a
    whole dataset.

    Args:
        dataset1: (image, label) element taken from the first dataset.
        dataset2: (image, label) element taken from the second dataset.

    Returns:
        Tuple (image1 + image2, label1 + label2).
    """
    image_data = dataset1[0] + dataset2[0]
    label_data = dataset1[1] + dataset2[1]
    return image_data, label_data
dataset = dataset.map(add_datasets)
# Both zipped inputs repeat indefinitely, so the dataset never ends: bound
# the loop with take() to print only the first 10 elements.
for a, b in dataset.take(10):
    print(a.shape, b)
输出:
(64, 64, 3) tf.Tensor(11, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(22, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(31, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(12, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(21, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(32, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(11, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(22, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(31, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(12, shape=(), dtype=int64)
我想在基于 tf.data 的管道中编写一个 mixup 数据增强 [1] 函数。
我创建了一个包含训练样本的 tf.data.Dataset,以及另一个包含用于增强训练样本的样本的 tf.data.Dataset。
我想将 dataset_train 的元素 (feat_train, label_train) 映射为 (feat_train + feat_aug, label_train + label_aug),其中 feat_aug 和 label_aug 是 dataset_aug 的元素,并且两个数据集都无限循环迭代。例如,对于具有 3 个元素的 dataset_train 和具有 2 个元素的 dataset_aug:
feat_train[0], label_train[0] -> feat_train[0] + feat_aug[0], label_train[0] + label_aug[0]
feat_train[1], label_train[1] -> feat_train[1] + feat_aug[1], label_train[1] + label_aug[1]
feat_train[2], label_train[2] -> feat_train[2] + feat_aug[0], label_train[2] + label_aug[0]
feat_train[0], label_train[0] -> feat_train[0] + feat_aug[1], label_train[0] + label_aug[1]
feat_train[1], label_train[1] -> feat_train[1] + feat_aug[0], label_train[1] + label_aug[0]
...
如何在我的 mixup 函数中实现这种行为?是否有其他推荐的方法,可以对两个无限迭代的 tf.data.Dataset 执行逐元素操作?
[1] Hongyi Zhang 等. "mixup: Beyond empirical risk minimization." arXiv 预印本 arXiv:1710.09412 (2017).
# files_train and files_aug are lists of TFRecord files.
# parse TFRecords to get training example features and
# one-hot encoded labels
dataset_train = tf.data.TFRecordDataset(files_train)
dataset_train = dataset_train.map(
lambda x: serialized2data(x, feature_shape, class_list))
dataset_train = dataset_train.shuffle(10000)
dataset_train = dataset_train.repeat() # Repeat indefinitely.
# parse TFRecords to get augmentation example features and
# one-hot encoded labels
dataset_aug = tf.data.TFRecordDataset(files_aug)
dataset_aug = dataset_aug.map(
lambda x: serialized2data(x, feature_shape, class_list))
dataset_aug = dataset_aug.repeat() # Repeat indefinitely.
# augment data (mixup)
# Here how can I write a map function so that the features of every item
# of dataset_train is mixed with an item of dataset_aug ?
# something like
# dataset_train = dataset_train.map(
# lambda feat_train, label_train: mixup(
# feat_train, label_train, feat_aug, label_aug)
# )
# ?
# but how can I iterate dataset_aug to get feat_aug and label_aug ?
# make batch
dataset_train = dataset_train.batch(batch_size, drop_remainder=True)
return dataset
def mixup(feat_train, label_train, feat_aug, label_aug):
    """Mix one training example with one augmentation example.

    Args:
        feat_train: features of the training example.
        label_train: labels of the training example.
        feat_aug: features of the augmentation example.
        label_aug: labels of the augmentation example.

    Returns:
        Tuple (feat_train + feat_aug, label_train + label_aug).
    """
    # Shown as an example. This will be more complicated...
    return (feat_train + feat_aug,
            label_train + label_aug)
def serialized2data(
        serialized_data,
        feature_shape,
        class_list,
        data_format='channels_first',
        training=True):
    """Generate features, labels and, if training is False, filenames and times.

    The label string stored in the record is split on '#', converted to
    integer class ids, restricted to the ids that also appear in class_list
    and encoded as a multi-hot vector with tf.size(class_list) entries.
    Position i of that vector corresponds to the i-th entry of the *sorted*
    class_list. An example with no label left after filtering gets an
    all-zero vector.

    Args:
        serialized_data: data serialized using utils.tf_utils.serialize_data
        feature_shape: shape of the features. Can be obtained with
            feature_extractor.feature_shape (see utils.feature_utils).
            Only its first two entries are used when reshaping the data.
        class_list: list of class ids (used for one-hot encoding the labels)
        data_format: 'channels_first' (NCHW) or 'channels_last' (NHWC).
            Default is set to 'channels_first' because it is faster on GPU
            (https://www.tensorflow.org/guide/performance/overview#data_formats).
            Any value other than 'channels_first' is handled as channels-last.
        training: when True only (data, one_hot) is returned, and examples
            without any label in class_list are routed to myprint.

    Returns:
        (data, one_hot) if training is True, otherwise
        (data, one_hot, filename, times).
    """
    features = {
        'filename': tf.io.FixedLenFeature([], tf.string),
        'times': tf.io.FixedLenFeature([2], tf.float32),
        'data': tf.io.FixedLenFeature(feature_shape, tf.float32),
        'labels': tf.io.FixedLenFeature([], tf.string),
    }
    example = tf.io.parse_single_example(serialized_data, features)

    # add a singleton channel axis, first or last depending on data_format
    if data_format == 'channels_first':
        data = tf.reshape(
            example['data'], (1, feature_shape[0], feature_shape[1]))
    else:
        data = tf.reshape(
            example['data'], (feature_shape[0], feature_shape[1], 1))

    # parse the '#'-separated label string into integer class ids
    labels = tf.strings.to_number(
        tf.string_split([example['labels']], '#').values,
        out_type=tf.int32
    )

    # get intersection of class_list and labels
    labels = tf.squeeze(
        tf.sparse.to_dense(
            tf.sets.intersection(
                tf.expand_dims(labels, axis=0),
                tf.expand_dims(class_list, axis=0)
            )
        ),
        axis=0
    )

    # sort class_list and get indices of labels in class_list
    class_list = tf.sort(class_list)
    labels = tf.where(
        tf.equal(
            tf.expand_dims(labels, axis=1),
            class_list)
    )[:, 1]  # column 1 holds the position inside the sorted class_list

    # Report training examples that end up without any label.
    # NOTE(review): the result of this tf.cond is discarded and myprint is
    # defined elsewhere -- confirm the message is actually emitted when this
    # runs inside Dataset.map, and that both branches return compatible values.
    tf.cond(
        tf.math.logical_and(training, tf.equal(tf.size(labels), 0)),
        true_fn=lambda: myprint(
            tf.strings.format('File {} has no label', example['filename'])),
        false_fn=lambda: 1
    )

    # multi-hot encode: merge the one-hot rows of all labels, or fall back to
    # an all-zero vector when there is no label
    one_hot = tf.cond(
        tf.equal(tf.size(labels), 0),
        true_fn=lambda: tf.zeros(tf.size(class_list)),
        false_fn=lambda: tf.reduce_max(
            tf.one_hot(labels, tf.size(class_list)), 0)
    )

    if training:
        return (data, one_hot)
    else:
        return (data, one_hot, example['filename'], example['times'])
下面提供一段示例代码,演示如何实现您想要的效果。我分别创建了长度为 3 和 2 的 train_dataset 和 aug_dataset,两者都包含图像和标签,图像的形状为 (64, 64, 3)。train_dataset 的标签是 [10, 20, 30],aug_dataset 的标签是 [1, 2]。
请特别留意输出中的标签,确认它们是否按照您期望的方式循环。
import numpy as np
import tensorflow as tf
# Eager execution is switched on before any dataset is built.
tf.enable_eager_execution()

# Toy training set: 3 random images of shape (64, 64, 3) with labels
# 10, 20, 30, repeated indefinitely.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (np.random.rand(3, 64, 64, 3), np.array([10, 20, 30]))
).repeat()

# Toy augmentation set: 2 random images of shape (64, 64, 3) with labels
# 1, 2, repeated indefinitely.
aug_dataset = tf.data.Dataset.from_tensor_slices(
    (np.random.rand(2, 64, 64, 3), np.arange(1, 3))
).repeat()

# Pair each training element with the augmentation element at the same
# position in the (repeated) streams.
dataset = tf.data.Dataset.zip((train_dataset, aug_dataset))
def add_datasets(dataset1, dataset2):
    """Add two zipped dataset elements component by component.

    Despite the parameter names, each argument is a single element (an
    indexable pair whose item 0 is the image and item 1 is the label), not a
    whole dataset.

    Args:
        dataset1: (image, label) element taken from the first dataset.
        dataset2: (image, label) element taken from the second dataset.

    Returns:
        Tuple (image1 + image2, label1 + label2).
    """
    image_data = dataset1[0] + dataset2[0]
    label_data = dataset1[1] + dataset2[1]
    return image_data, label_data
dataset = dataset.map(add_datasets)
# Both zipped inputs repeat indefinitely, so the dataset never ends: bound
# the loop with take() to print only the first 10 elements.
for a, b in dataset.take(10):
    print(a.shape, b)
输出:
(64, 64, 3) tf.Tensor(11, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(22, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(31, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(12, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(21, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(32, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(11, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(22, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(31, shape=(), dtype=int64)
(64, 64, 3) tf.Tensor(12, shape=(), dtype=int64)