Filter data in Tensorflow
I have some columnar data in TensorFlow, and I want to filter on one of the columns, like so:
import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow.compat.v1 as tfv1
tfv1.enable_v2_behavior()
csv_file = tf.keras.utils.get_file('heart.csv', 'https://storage.googleapis.com/applied-dl/heart.csv')
df = pd.read_csv(csv_file)
target = df.pop('target')
df['thal'] = pd.Categorical(df['thal'])
df['thal'] = df.thal.cat.codes
# Use interleave() and prefetch() to read many files concurrently.
#files = tf.data.Dataset.list_files(file_pattern=input_file_pattern, shuffle=True, seed=123456789)
#dataset = files.interleave(lambda x: tf.data.RecordIODataset(x).prefetch(100), cycle_length=8)
# Pretend I actually had some data files
dataset = tf.data.Dataset.from_tensor_slices((df.to_dict('list'), target.values))
dataset = dataset.shuffle(1000, seed=123456789)
dataset = dataset.batch(20)
# Pretend I did some parsing here
# dataset = dataset.map(parse_record, num_parallel_calls=20)
dataset = dataset.filter(lambda x, label: x['trestbps'] < 135)
But this produces the error message:

ValueError: predicate return type must be convertible to a scalar boolean tensor. Was TensorSpec(shape=(None,), dtype=tf.bool, name=None).

How should I filter the data?
This is because you apply filter after batch. In the lambda expression, x is therefore a batch of shape (None,) (pass drop_remainder=True to batch to get a shape of (20,)), not a single sample, so the predicate returns a vector of booleans instead of the scalar boolean that filter expects. To fix it, you have to call filter before batch, as sketched below.
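A minimal sketch based on the question's snippet, assuming the same dataset: with filter moved before batch, the predicate sees one sample at a time and returns a scalar boolean.

dataset = tf.data.Dataset.from_tensor_slices((df.to_dict('list'), target.values))
dataset = dataset.shuffle(1000, seed=123456789)
# Per-sample predicate: x['trestbps'] is a scalar here, so the comparison
# yields the scalar boolean that filter() expects.
dataset = dataset.filter(lambda x, label: x['trestbps'] < 135)
dataset = dataset.batch(20)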
There is a workaround to "filter" after batch, using map instead. However, as the benchmark below shows, it has the side effect of producing batches of variable size: you receive a batch of 20 as input and drop the elements that do not match the condition (trestbps < 135), so each batch shrinks by a different amount instead of losing the same number of elements. Moreover, this workaround performs very poorly...
import timeit

import pandas as pd
import tensorflow.compat.v2 as tf
import tensorflow.compat.v1 as tfv1

tfv1.enable_v2_behavior()


def s1(ds):
    # Strategy 1: filter individual samples, then batch.
    dataset = ds
    dataset = dataset.filter(lambda x, label: x['trestbps'] < 135)
    dataset = dataset.batch(20)
    return dataset


def s2(ds):
    # Strategy 2: batch first, then drop non-matching rows from each batch
    # with map(), using a boolean mask on every feature and on the labels.
    dataset = ds
    dataset = dataset.batch(20)
    dataset = dataset.map(lambda x, label: (
        tf.nest.map_structure(lambda y: y[x['trestbps'] < 135], x),
        label[x['trestbps'] < 135]))
    return dataset


def base_ds():
    csv_file = tf.keras.utils.get_file('heart.csv', 'https://storage.googleapis.com/applied-dl/heart.csv')
    df = pd.read_csv(csv_file)
    target = df.pop('target')
    df['thal'] = pd.Categorical(df['thal'])
    df['thal'] = df.thal.cat.codes
    return tf.data.Dataset.from_tensor_slices((df.to_dict('list'), target.values))


def main():
    ds = base_ds()
    ds1 = s1(ds)
    ds2 = s2(ds)
    tf.print("DS_S1:", [tf.nest.map_structure(lambda x: x.shape, x) for x in ds1])
    tf.print("DS_S2:", [tf.nest.map_structure(lambda x: x.shape, x) for x in ds2])
    tf.print("Are equals?", [x for x in ds1] == [x for x in ds2])
    tf.print("Contains same elements?", [x for x in ds1.unbatch()] == [x for x in ds2.unbatch()])
    tf.print("Filter and batch:", timeit.timeit(lambda: s1(ds), number=100))
    tf.print("Batch and map:", timeit.timeit(lambda: s2(ds), number=100))


if __name__ == '__main__':
    main()
Result:
# Tensor shapes
[...]
Are equals? False
Contains same elements? True
Filter and batch: 0.5571189750007761
Batch and map: 15.582061060000342
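If you really do need to "filter" after batch but still want fixed-size batches, one option (a sketch under the same assumptions as s2 above, not benchmarked here) is to mask each batch, then unbatch and re-batch:

dataset = base_ds()
dataset = dataset.batch(20)
# Drop non-matching rows from each batch, leaving variable-size batches.
dataset = dataset.map(lambda x, label: (
    tf.nest.map_structure(lambda y: y[x['trestbps'] < 135], x),
    label[x['trestbps'] < 135]))
dataset = dataset.unbatch()  # flatten back to individual samples
dataset = dataset.batch(20)  # re-batch to a uniform size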