How to use the Dataset API to read a TFRecords file of variable-length lists?
I want to use TensorFlow's Dataset API to read a TFRecords file of variable-length lists. Here is my code.
import numpy as np
import tensorflow as tf

def _int64_feature(value):
    # value must be a numpy array.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def main1():
    # Write an array to a TFRecord file.
    # a is an array which contains lists of variable length.
    a = np.array([[0, 54, 91, 153, 177],
                  [0, 50, 89, 147, 196],
                  [0, 38, 79, 157],
                  [0, 49, 89, 147, 177],
                  [0, 32, 73, 145]])

    writer = tf.python_io.TFRecordWriter('file')
    for i in range(a.shape[0]):  # i = 0 ~ 4
        x_train = a[i]
        feature = {'i': _int64_feature(np.array([i])), 'data': _int64_feature(x_train)}

        # Create an example protocol buffer
        example = tf.train.Example(features=tf.train.Features(feature=feature))

        # Serialize to string and write to the file
        writer.write(example.SerializeToString())
    writer.close()

    # Check the TFRecord file.
    record_iterator = tf.python_io.tf_record_iterator(path='file')
    for string_record in record_iterator:
        example = tf.train.Example()
        example.ParseFromString(string_record)

        i = (example.features.feature['i'].int64_list.value)
        data = (example.features.feature['data'].int64_list.value)
        #data = np.fromstring(data_string, dtype=np.int64)
        print(i, data)

    # Use the Dataset API to read the TFRecord file.
    def _parse_function(example_proto):
        keys_to_features = {'i'   : tf.FixedLenFeature([], tf.int64),
                            'data': tf.FixedLenFeature([], tf.int64)}
        parsed_features = tf.parse_single_example(example_proto, keys_to_features)
        return parsed_features['i'], parsed_features['data']

    ds = tf.data.TFRecordDataset('file')
    iterator = ds.map(_parse_function).make_one_shot_iterator()
    i, data = iterator.get_next()
    with tf.Session() as sess:
        print(i.eval())
        print(data.eval())
Checking the TFRecord file prints:
[0] [0, 54, 91, 153, 177]
[1] [0, 50, 89, 147, 196]
[2] [0, 38, 79, 157]
[3] [0, 49, 89, 147, 177]
[4] [0, 32, 73, 145]
But when I try to read the TFRecord file with the Dataset API, I get the following error.
tensorflow.python.framework.errors_impl.InvalidArgumentError: Name: , Key: data, Index: 0.
Number of int64 values != expected. Values size: 5 but output shape: []
Thank you.
Update:
I tried to read the TFRecord file with the Dataset API using the following code, but both attempts failed.
def _parse_function(example_proto):
    keys_to_features = {'i'   : tf.FixedLenFeature([], tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']

ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
or
def _parse_function(example_proto):
    keys_to_features = {'i'   : tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return parsed_features['i'], parsed_features['data']

ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()
with tf.Session() as sess:
    print(sess.run([i, data]))
Error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in make_tensor_proto
    str_values = [compat.as_bytes(x) for x in proto_values]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 468, in <listcomp>
    str_values = [compat.as_bytes(x) for x in proto_values]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 65, in as_bytes
    (bytes_or_text,))
TypeError: Expected binary or unicode string, got

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "2tfrecord.py", line 126, in <module>
    main1()
  File "2tfrecord.py", line 72, in main1
    iterator = ds.map(_parse_function).make_one_shot_iterator()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 712, in map
    return MapDataset(self, map_func)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1385, in __init__
    self._map_func.add_to_graph(ops.get_default_graph())
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 486, in add_to_graph
    self._create_definition_if_needed()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 321, in _create_definition_if_needed
    self._create_definition_if_needed_impl()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/function.py", line 338, in _create_definition_if_needed_impl
    outputs = self._func(*inputs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1376, in tf_map_func
    flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/dataset_ops.py", line 1376, in <listcomp>
    flattened_ret = [ops.convert_to_tensor(t) for t in nest.flatten(ret)]
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 836, in convert_to_tensor
    as_ref=False)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 926, in internal_convert_to_tensor
    ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 229, in _constant_tensor_conversion_function
    return constant(v, dtype=dtype, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/constant_op.py", line 208, in constant
    value, dtype=dtype, shape=shape, verify_shape=verify_shape))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/tensor_util.py", line 472, in make_tensor_proto
    "supported type." % (type(values), values))
TypeError: Failed to convert object of type to Tensor. Contents:
SparseTensor(indices=Tensor("ParseSingleExample/Slice_Indices_i:0", shape=(?, 1), dtype=int64),
values=Tensor("ParseSingleExample/ParseExample/ParseExample:3", shape=(?,), dtype=int64),
dense_shape=Tensor("ParseSingleExample/Squeeze_Shape_i:0", shape=(1,), dtype=int64)).
Consider casting elements to a supported type.
Python version: 3.5.2
TensorFlow version: 1.4.1
The error is straightforward. Your data is not a FixedLenFeature; it is a VarLenFeature. Replace your line:

'data':tf.FixedLenFeature([], tf.int64)}

with

'data':tf.VarLenFeature(tf.int64)}

Also, when you call print(i.eval()) and print(data.eval()), you advance the iterator twice: the first print prints 0, but the second prints the values of the second row, [0, 50, 89, 147, 196]. You can do print(sess.run([i, data])) to get i and data from the same row.
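
Putting the two suggestions together, a minimal sketch of the parse step might look like the following (TensorFlow 1.x API assumed). tf.sparse_tensor_to_dense is added because, as the question's update and the answer below suggest, Dataset.map in TF 1.4 does not accept a SparseTensor return value.

def _parse_function(example_proto):
    keys_to_features = {'i'   : tf.FixedLenFeature([], tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    # 'data' is parsed as a SparseTensor; convert it to a dense tensor
    # so that Dataset.map can handle the return value.
    return parsed_features['i'], tf.sparse_tensor_to_dense(parsed_features['data'])

ds = tf.data.TFRecordDataset('file')
iterator = ds.map(_parse_function).make_one_shot_iterator()
i, data = iterator.get_next()

with tf.Session() as sess:
    # A single sess.run fetches i and data from the same record.
    print(sess.run([i, data]))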
After hours of searching and trying, I believe the answer has finally emerged. Below is my code.
import numpy as np
import tensorflow as tf

def _int64_feature(value):
    # value must be a numpy array.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value.flatten()))

# Write an array to a TFRecord file.
# a is an array which contains lists of variable length.
a = np.array([[0, 54, 91, 153, 177],
              [0, 50, 89, 147, 196],
              [0, 38, 79, 157],
              [0, 49, 89, 147, 177],
              [0, 32, 73, 145]])

writer = tf.python_io.TFRecordWriter('file')
for i in range(a.shape[0]):  # i = 0 ~ 4
    x_train = np.array(a[i])
    feature = {'i'   : _int64_feature(np.array([i])),
               'data': _int64_feature(x_train)}

    # Create an example protocol buffer
    example = tf.train.Example(features=tf.train.Features(feature=feature))

    # Serialize to string and write to the file
    writer.write(example.SerializeToString())
writer.close()

# Check the TFRecord file.
record_iterator = tf.python_io.tf_record_iterator(path='file')
for string_record in record_iterator:
    example = tf.train.Example()
    example.ParseFromString(string_record)

    i = (example.features.feature['i'].int64_list.value)
    data = (example.features.feature['data'].int64_list.value)
    print(i, data)

# Use the Dataset API to read the TFRecord file.
filenames = ["file"]
dataset = tf.data.TFRecordDataset(filenames)

def _parse_function(example_proto):
    keys_to_features = {'i'   : tf.VarLenFeature(tf.int64),
                        'data': tf.VarLenFeature(tf.int64)}
    parsed_features = tf.parse_single_example(example_proto, keys_to_features)
    return tf.sparse_tensor_to_dense(parsed_features['i']), \
           tf.sparse_tensor_to_dense(parsed_features['data'])

# Parse the records into tensors.
dataset = dataset.map(_parse_function)
# Shuffle the dataset
dataset = dataset.shuffle(buffer_size=1)
# Repeat the input indefinitely
dataset = dataset.repeat()
# Generate batches
dataset = dataset.batch(1)
# Create a one-shot iterator
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()

with tf.Session() as sess:
    print(sess.run([i, data]))
    print(sess.run([i, data]))
    print(sess.run([i, data]))
A few things to note:
1. This question was very helpful.
2. tf.VarLenFeature returns a SparseTensor, so you need tf.sparse_tensor_to_dense to convert it into a dense tensor.
3. In my code, parse_single_example() cannot be replaced with parse_example(), which bothered me for a whole day. I don't know why parse_example() doesn't work. If anyone knows the reason, please enlighten me.
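
One possible explanation for note 3, offered as a guess: tf.parse_example expects a batch of serialized Example protos (a 1-D string tensor), whereas each element a TFRecordDataset passes to the map function is a single scalar string, which is what tf.parse_single_example takes. A hedged sketch of how parse_example could still be used, by batching the serialized strings first (TensorFlow 1.x API assumed; the batch size 2 is arbitrary):

import tensorflow as tf

keys_to_features = {'i'   : tf.VarLenFeature(tf.int64),
                    'data': tf.VarLenFeature(tf.int64)}

def _parse_batch(serialized_batch):
    # tf.parse_example takes a vector of serialized protos, so the dataset
    # must be batched before this map is applied.
    parsed = tf.parse_example(serialized_batch, keys_to_features)
    return (tf.sparse_tensor_to_dense(parsed['i']),
            tf.sparse_tensor_to_dense(parsed['data']))

dataset = tf.data.TFRecordDataset(['file']).batch(2).map(_parse_batch)
iterator = dataset.make_one_shot_iterator()
i, data = iterator.get_next()

with tf.Session() as sess:
    # Each run returns a zero-padded [batch, max_len] array for 'data'.
    print(sess.run([i, data]))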