将 ZipDataSet 写入 TFRecord
Writing ZipDataSet to TFRecord
我正在尝试按照教程(tutorial)将压缩(zip)后的数据集写入 TFRecord 文件,但我的情况有所不同:ZipDataSet 中每个数据集的每个元素都是张量而不是标量。
该教程通过下面这条注释说明了如何处理这种情况:
Note: To stay simple, this example only uses scalar inputs. The simplest way to handle non-scalar features is to use tf.serialize_tensor to convert tensors to binary-strings. Strings are scalars in tensorflow. Use tf.parse_tensor to convert the binary-string back to a tensor.
但我收到的错误似乎表明 _bytes_feature 函数获取的是张量而不是字节。
import tensorflow as tf
import numpy as np
sess = tf.Session()
def _bytes_feature(value):
    """Wrap a single byte string in a ``tf.train.Feature``.

    Args:
        value: The payload to store. ``bytes`` is used as-is, a text string
            is UTF-8 encoded, and an object exposing ``.numpy()`` (such as an
            eager tensor) is first converted to the value it holds.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` holds exactly one entry.
    """
    # BytesList cannot unpack the contents of an eager tensor by itself, so
    # pull the underlying value out first.
    if hasattr(value, 'numpy'):
        value = value.numpy()
    # A protobuf bytes field rejects text strings; type(u'') is the text type
    # on both Python 2 and Python 3, so raw byte strings are never re-encoded.
    if isinstance(value, type(u'')):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def serialize_with_labels(a, b, c, d):
    """Build a serialized ``tf.train.Example`` from four byte-string values.

    Args:
        a, b, c, d: Values stored under the feature names ``'a'``, ``'b'``,
            ``'c'`` and ``'d'``; each is wrapped by ``_bytes_feature``.

    Returns:
        The ``tf.train.Example`` message serialized to a byte string, ready
        to be written to a file.
    """
    names = ('a', 'b', 'c', 'd')
    values = (a, b, c, d)
    # Map every feature name to its tf.Example-compatible wrapper.
    feature = {
        name: _bytes_feature(value) for name, value in zip(names, values)
    }
    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()
def tf_serialize_w_labels(a, b, c, d):
    """Graph-side wrapper that lets ``serialize_with_labels`` be used in
    ``tf.data.Dataset.map``.

    Args:
        a, b, c, d: Tensors forwarded to ``serialize_with_labels`` through
            ``tf.py_func``.

    Returns:
        The ``tf.string`` output of the py_func, reshaped to a scalar.
    """
    serialized = tf.py_func(
        func=serialize_with_labels,
        inp=(a, b, c, d),
        Tout=tf.string,
    )
    # Reshape to () so the result is a scalar string tensor.
    return tf.reshape(tensor=serialized, shape=())
# a, b and c are [n, m, p] tensors; d is an [n, 1, 1] tensor.
# from_tensor_slices is a static factory method, so it is called on the
# Dataset class rather than on a Dataset() instance.
zipped = tf.data.Dataset.from_tensor_slices((a, b, c, d))
# Dataset.map hands the four components of each element to the mapped
# function as separate arguments, so every component is serialized
# explicitly. Each item of serial_tensors is then a tuple of four scalar
# byte strings.
serial_tensors = zipped.map(
    lambda a_i, b_i, c_i, d_i: (
        tf.serialize_tensor(a_i),
        tf.serialize_tensor(b_i),
        tf.serialize_tensor(c_i),
        tf.serialize_tensor(d_i),
    )
)
# Each item of serialized_features_dataset is a single byte string produced
# by tf_serialize_w_labels.
serialized_features_dataset = serial_tensors.map(tf_serialize_w_labels)
writer = tf.contrib.data.TFRecordWriter('test_output')
writeop = writer.write(serialized_features_dataset)
# The records are written when the write op is run.
sess.run(writeop)
以上是我尝试运行的代码的基本形式。写入可以成功,但是当我用下面的代码读入 TFRecord 时,
def _parse_function(example_proto):
    """Parse one serialized ``tf.Example`` into a dict of scalar strings.

    Args:
        example_proto: A serialized ``tf.Example`` record.

    Returns:
        The result of ``tf.parse_single_example``: a dict with the keys
        ``'a'``, ``'b'``, ``'c'`` and ``'d'``, each described as a scalar
        ``tf.string`` feature with an empty-string default.
    """
    feature_description = {
        key: tf.FixedLenFeature([], tf.string, default_value='')
        for key in ('a', 'b', 'c', 'd')
    }
    return tf.parse_single_example(example_proto, feature_description)
# Read back the file produced by TFRecordWriter('test_output').
filenames = ['test_output']
raw_dataset = tf.data.TFRecordDataset(filenames)
parsed = raw_dataset.map(_parse_function)
parsed_it = parsed.make_one_shot_iterator()
# Call get_next() once and fetch everything in a single run: every run of a
# get_next() op advances the iterator, so separate runs would decode a, b, c
# and d from four different records.
next_record = parsed_it.get_next()
# NOTE: out_type must match the dtype each tensor had when it was serialized;
# int32 is assumed for all four here - adjust if yours differ.
decoded = {
    key: tf.parse_tensor(next_record[key], out_type=tf.int32)
    for key in ('a', 'b', 'c', 'd')
}
first_record = sess.run(decoded)
# prints the first element of a
print(first_record['a'])
# prints the first element of b
print(first_record['b'])
# prints the first element of c
print(first_record['c'])
# prints the first element of d (taken from the same record as a, b and c)
print(first_record['d'])
这不是迭代器耗尽的问题:例如,我曾尝试在打印 a、b 或 c 之前先打印 d,结果什么也没得到,随后在同一个会话中却成功打印了 a。
我正在使用 tensorflow-gpu 1.10 版,并且暂时只能停留在这个版本,这就是我使用
以下写法的原因:
writer = tf.contrib.data.TFRecordWriter('test_output')
代替
writer = tf.data.experimental.TFRecordWriter('test_output')
编辑:下面的做法是可行的。
首先,我将 a、b、c 和 d 展平为形状 [n,-1]。然后我将 serialize_w_labels 改为下面的代码(tf_serialize_w_examples 保持不变)。
def serialize_w_labels(a, b, c, d, n, m, p):
    """Serialize one record as a ``tf.train.SequenceExample``.

    Args:
        a, b: Iterables whose items are appended, one step each, to the
            int64 feature lists ``"a"`` and ``"b"``.
        c: Iterable whose items are appended, one step each, to the float
            feature list ``"c"``.
        d, n, m, p: Values stored as int64 context features under the keys
            ``"d"``, ``"n"``, ``"m"`` and ``"p"``.

    Returns:
        The SequenceExample serialized to a byte string.
    """
    seq_example = tf.train.SequenceExample()

    # Non-sequential (context) features of the example.
    context = seq_example.context.feature
    for key, scalar in (("d", d), ("n", n), ("m", m), ("p", p)):
        context[key].int64_list.value.append(scalar)

    # Feature lists for the three sequential features of the example.
    feature_lists = seq_example.feature_lists.feature_list
    steps_a = feature_lists["a"]
    steps_b = feature_lists["b"]
    steps_c = feature_lists["c"]

    # zip stops at the shortest input, so one step is added per aligned item.
    for item_a, item_b, item_c in zip(a, b, c):
        steps_a.feature.add().int64_list.value.append(item_a)
        steps_b.feature.add().int64_list.value.append(item_b)
        steps_c.feature.add().float_list.value.append(item_c)

    return seq_example.SerializeToString()
以下代码可以正确解析所得数据集中的元素:
# Context features: one scalar int64 per key.
context_features = {
    key: tf.FixedLenFeature([], dtype=tf.int64)
    for key in ("d", "m", "n", "p")
}
# Sequence features: a and b hold int64 steps, c holds float32 steps.
sequence_features = {
    key: tf.FixedLenSequenceFeature([], dtype=dtype)
    for key, dtype in (("a", tf.int64), ("b", tf.int64), ("c", tf.float32))
}
# ex is the serialized SequenceExample to parse.
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
    serialized=ex,
    context_features=context_features,
    sequence_features=sequence_features,
)
显然,您的 dtype 可能会有所不同。之后可以利用上下文特征将展平的 a、b 和 c 重塑回原来的形状。
我认为您应该了解一下 tf.io.FixedLenSequenceFeature,它允许您将一个特征序列作为单个特征写入 TFRecord 文件。例如,YouTube8M 数据集用它来存储这样的特征:每个视频对应一组帧,而每一帧对应一个 Tensor。
文档:
https://www.tensorflow.org/api_docs/python/tf/io/FixedLenSequenceFeature
阅读示例:
https://github.com/google/youtube-8m/blob/2c94ed449737c886175a5fff1bfba7eadc4de5ac/readers.py
如果您想使用 tf.io.serialize_tensor 来序列化记录,可以创建一个会话并对张量求值,或者使用 .numpy() 方法。
评估方法
x = tf.constant(
    [
        [[0, 1], [2, 3]],
        [[1, 1], [2, 2]],
    ],
    dtype=tf.float32,
)
# Run the serialization op in a session to obtain the serialized bytes.
with tf.compat.v1.Session() as sess:
    str_bytes = sess.run(fetches=tf.io.serialize_tensor(x))
numpy 方法
x = tf.constant(
    [
        [[0, 1], [2, 3]],
        [[1, 1], [2, 2]],
    ],
    dtype=tf.float32,
)
# Read the serialized value back with .numpy() instead of running a session.
str_bytes = tf.io.serialize_tensor(tensor=x).numpy()
我正在尝试按照教程(tutorial)将压缩(zip)后的数据集写入 TFRecord 文件,但我的情况有所不同:ZipDataSet 中每个数据集的每个元素都是张量而不是标量。
该教程通过下面这条注释说明了如何处理这种情况:
Note: To stay simple, this example only uses scalar inputs. The simplest way to handle non-scalar features is to use tf.serialize_tensor to convert tensors to binary-strings. Strings are scalars in tensorflow. Use tf.parse_tensor to convert the binary-string back to a tensor.
但我收到的错误似乎表明 _bytes_feature 函数获取的是张量而不是字节。
import tensorflow as tf
import numpy as np
sess = tf.Session()
def _bytes_feature(value):
    """Wrap a single byte string in a ``tf.train.Feature``.

    Args:
        value: The payload to store. ``bytes`` is used as-is, a text string
            is UTF-8 encoded, and an object exposing ``.numpy()`` (such as an
            eager tensor) is first converted to the value it holds.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` holds exactly one entry.
    """
    # BytesList cannot unpack the contents of an eager tensor by itself, so
    # pull the underlying value out first.
    if hasattr(value, 'numpy'):
        value = value.numpy()
    # A protobuf bytes field rejects text strings; type(u'') is the text type
    # on both Python 2 and Python 3, so raw byte strings are never re-encoded.
    if isinstance(value, type(u'')):
        value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def serialize_with_labels(a, b, c, d):
    """Build a serialized ``tf.train.Example`` from four byte-string values.

    Args:
        a, b, c, d: Values stored under the feature names ``'a'``, ``'b'``,
            ``'c'`` and ``'d'``; each is wrapped by ``_bytes_feature``.

    Returns:
        The ``tf.train.Example`` message serialized to a byte string, ready
        to be written to a file.
    """
    names = ('a', 'b', 'c', 'd')
    values = (a, b, c, d)
    # Map every feature name to its tf.Example-compatible wrapper.
    feature = {
        name: _bytes_feature(value) for name, value in zip(names, values)
    }
    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features).SerializeToString()
def tf_serialize_w_labels(a, b, c, d):
    """Graph-side wrapper that lets ``serialize_with_labels`` be used in
    ``tf.data.Dataset.map``.

    Args:
        a, b, c, d: Tensors forwarded to ``serialize_with_labels`` through
            ``tf.py_func``.

    Returns:
        The ``tf.string`` output of the py_func, reshaped to a scalar.
    """
    serialized = tf.py_func(
        func=serialize_with_labels,
        inp=(a, b, c, d),
        Tout=tf.string,
    )
    # Reshape to () so the result is a scalar string tensor.
    return tf.reshape(tensor=serialized, shape=())
# a, b and c are [n, m, p] tensors; d is an [n, 1, 1] tensor.
# from_tensor_slices is a static factory method, so it is called on the
# Dataset class rather than on a Dataset() instance.
zipped = tf.data.Dataset.from_tensor_slices((a, b, c, d))
# Dataset.map hands the four components of each element to the mapped
# function as separate arguments, so every component is serialized
# explicitly. Each item of serial_tensors is then a tuple of four scalar
# byte strings.
serial_tensors = zipped.map(
    lambda a_i, b_i, c_i, d_i: (
        tf.serialize_tensor(a_i),
        tf.serialize_tensor(b_i),
        tf.serialize_tensor(c_i),
        tf.serialize_tensor(d_i),
    )
)
# Each item of serialized_features_dataset is a single byte string produced
# by tf_serialize_w_labels.
serialized_features_dataset = serial_tensors.map(tf_serialize_w_labels)
writer = tf.contrib.data.TFRecordWriter('test_output')
writeop = writer.write(serialized_features_dataset)
# The records are written when the write op is run.
sess.run(writeop)
以上是我尝试运行的代码的基本形式。写入可以成功,但是当我用下面的代码读入 TFRecord 时,
def _parse_function(example_proto):
    """Parse one serialized ``tf.Example`` into a dict of scalar strings.

    Args:
        example_proto: A serialized ``tf.Example`` record.

    Returns:
        The result of ``tf.parse_single_example``: a dict with the keys
        ``'a'``, ``'b'``, ``'c'`` and ``'d'``, each described as a scalar
        ``tf.string`` feature with an empty-string default.
    """
    feature_description = {
        key: tf.FixedLenFeature([], tf.string, default_value='')
        for key in ('a', 'b', 'c', 'd')
    }
    return tf.parse_single_example(example_proto, feature_description)
# Read back the file produced by TFRecordWriter('test_output').
filenames = ['test_output']
raw_dataset = tf.data.TFRecordDataset(filenames)
parsed = raw_dataset.map(_parse_function)
parsed_it = parsed.make_one_shot_iterator()
# Call get_next() once and fetch everything in a single run: every run of a
# get_next() op advances the iterator, so separate runs would decode a, b, c
# and d from four different records.
next_record = parsed_it.get_next()
# NOTE: out_type must match the dtype each tensor had when it was serialized;
# int32 is assumed for all four here - adjust if yours differ.
decoded = {
    key: tf.parse_tensor(next_record[key], out_type=tf.int32)
    for key in ('a', 'b', 'c', 'd')
}
first_record = sess.run(decoded)
# prints the first element of a
print(first_record['a'])
# prints the first element of b
print(first_record['b'])
# prints the first element of c
print(first_record['c'])
# prints the first element of d (taken from the same record as a, b and c)
print(first_record['d'])
这不是迭代器耗尽的问题:例如,我曾尝试在打印 a、b 或 c 之前先打印 d,结果什么也没得到,随后在同一个会话中却成功打印了 a。
我正在使用 tensorflow-gpu 1.10 版,并且暂时只能停留在这个版本,这就是我使用以下写法的原因:
writer = tf.contrib.data.TFRecordWriter('test_output')
代替
writer = tf.data.experimental.TFRecordWriter('test_output')
编辑:下面的做法是可行的。
首先,我将 a、b、c 和 d 展平为形状 [n,-1]。然后我将 serialize_w_labels 改为下面的代码(tf_serialize_w_examples 保持不变)。
def serialize_w_labels(a, b, c, d, n, m, p):
    """Serialize one record as a ``tf.train.SequenceExample``.

    Args:
        a, b: Iterables whose items are appended, one step each, to the
            int64 feature lists ``"a"`` and ``"b"``.
        c: Iterable whose items are appended, one step each, to the float
            feature list ``"c"``.
        d, n, m, p: Values stored as int64 context features under the keys
            ``"d"``, ``"n"``, ``"m"`` and ``"p"``.

    Returns:
        The SequenceExample serialized to a byte string.
    """
    seq_example = tf.train.SequenceExample()

    # Non-sequential (context) features of the example.
    context = seq_example.context.feature
    for key, scalar in (("d", d), ("n", n), ("m", m), ("p", p)):
        context[key].int64_list.value.append(scalar)

    # Feature lists for the three sequential features of the example.
    feature_lists = seq_example.feature_lists.feature_list
    steps_a = feature_lists["a"]
    steps_b = feature_lists["b"]
    steps_c = feature_lists["c"]

    # zip stops at the shortest input, so one step is added per aligned item.
    for item_a, item_b, item_c in zip(a, b, c):
        steps_a.feature.add().int64_list.value.append(item_a)
        steps_b.feature.add().int64_list.value.append(item_b)
        steps_c.feature.add().float_list.value.append(item_c)

    return seq_example.SerializeToString()
以下代码可以正确解析所得数据集中的元素:
# Context features: one scalar int64 per key.
context_features = {
    key: tf.FixedLenFeature([], dtype=tf.int64)
    for key in ("d", "m", "n", "p")
}
# Sequence features: a and b hold int64 steps, c holds float32 steps.
sequence_features = {
    key: tf.FixedLenSequenceFeature([], dtype=dtype)
    for key, dtype in (("a", tf.int64), ("b", tf.int64), ("c", tf.float32))
}
# ex is the serialized SequenceExample to parse.
context_parsed, sequence_parsed = tf.parse_single_sequence_example(
    serialized=ex,
    context_features=context_features,
    sequence_features=sequence_features,
)
显然,您的 dtype 可能会有所不同。之后可以利用上下文特征将展平的 a、b 和 c 重塑回原来的形状。
我认为您应该了解一下 tf.io.FixedLenSequenceFeature,它允许您将一个特征序列作为单个特征写入 TFRecord 文件。例如,YouTube8M 数据集用它来存储这样的特征:每个视频对应一组帧,而每一帧对应一个 Tensor。
文档: https://www.tensorflow.org/api_docs/python/tf/io/FixedLenSequenceFeature
阅读示例: https://github.com/google/youtube-8m/blob/2c94ed449737c886175a5fff1bfba7eadc4de5ac/readers.py
如果您想使用 tf.io.serialize_tensor 来序列化记录,可以创建一个会话并对张量求值,或者使用 .numpy() 方法。
评估方法
x = tf.constant(
    [
        [[0, 1], [2, 3]],
        [[1, 1], [2, 2]],
    ],
    dtype=tf.float32,
)
# Run the serialization op in a session to obtain the serialized bytes.
with tf.compat.v1.Session() as sess:
    str_bytes = sess.run(fetches=tf.io.serialize_tensor(x))
numpy 方法
x = tf.constant(
    [
        [[0, 1], [2, 3]],
        [[1, 1], [2, 2]],
    ],
    dtype=tf.float32,
)
# Read the serialized value back with .numpy() instead of running a session.
str_bytes = tf.io.serialize_tensor(tensor=x).numpy()