将 TensorFlow 数据集转换为包含字符串的 Python 列表
Convert a tensorflow dataset to a python list with strings
考虑以下代码:
import numpy as np
import tensorflow as tf
# Twelve rows of toy data: row k (k = 1..12) is [k, k, k, -k, -k].
simple_data_samples = np.array(
    [[k, k, k, -k, -k] for k in range(1, 13)]
)
def timeseries_dataset_multistep_combined(features, label_slice, input_sequence_length, output_sequence_length, batch_size):
    """Build a dataset of (inputs, labels) string windows from a feature array.

    Sliding windows of ``input_sequence_length + output_sequence_length``
    steps are created from *features* (no separate targets are passed), each
    batch is converted to strings and then split along axis 1 into an input
    part and a label part.

    Args:
        features: Array passed to ``timeseries_dataset_from_array`` as data.
        label_slice: Index or slice applied to the last axis of the label part.
        input_sequence_length: Number of leading steps returned as inputs.
        output_sequence_length: Number of trailing steps returned as labels.
        batch_size: Batch size forwarded to ``timeseries_dataset_from_array``.

    Returns:
        The windowed dataset mapped to ``(inputs, labels)`` string tensors.
    """
    window_length = input_sequence_length + output_sequence_length
    feature_ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        features, None, window_length, batch_size=batch_size
    )

    def split_feature_label(x):
        # Stringify the numeric batch first, then cut every window in two:
        # the first `input_sequence_length` steps keep all features, the
        # remaining steps keep only the features selected by `label_slice`.
        x = tf.strings.as_string(x)
        return x[:, :input_sequence_length, :], x[:, input_sequence_length:, label_slice]

    return feature_ds.map(split_feature_label)
# Windows of 4 input steps followed by 2 label steps, one window per batch;
# slice(None) keeps every feature column in the label part.
ds = timeseries_dataset_multistep_combined(
    simple_data_samples,
    slice(None),
    input_sequence_length=4,
    output_sequence_length=2,
    batch_size=1,
)
def print_dataset(ds):
    """Print the feature and label arrays of every batch in *ds*.

    Args:
        ds: Iterable yielding ``(inputs, targets)`` pairs whose elements
            expose a ``.numpy()`` method.
    """
    for inputs, targets in ds:
        print("---Batch---")
        print("Feature:", inputs.numpy())
        print("Label:", targets.numpy())
        print("")
# Show every (feature, label) batch of the dataset built above.
print_dataset(ds)
TensorFlow 数据集“ds”由输入和目标组成。
现在我想将这个 TensorFlow 数据集转换为具有以下属性的 Python 列表:
Index Type Size Value
0 str 13 1 2 3 4 5 6
1 str 13 1 2 3 4 5 6
2 str 13 1 2 3 4 5 6
3 str 13 -1 -2 -3 -4 -5 -6
4 str 13 -1 -2 -3 -4 -5 -6
5 str 13 2 3 4 5 6 7
.... and so on
在上面的例子中,假设已经创建了一个由字符串组成的 Python 列表。在“Value”(值)一列中,左侧是 TensorFlow 数据集的输入(例如 1 2 3 4,各个值之间用一个空格分隔),右侧是对应的目标(例如 5 6,各个值之间同样用一个空格分隔)。需要注意的是,输入和目标之间用一个水平制表符“\t”分隔(例如 1 2 3 4.\t5 6.)
我该如何编码?
我使用了你的 print_dataset 函数。
def print_dataset(ds):
    """Convert every batch of *ds* into tab-separated strings; print and return them.

    For each ``(inputs, targets)`` pair the first batch element is taken and
    transposed, so that iteration runs over its last axis. Each resulting row
    of inputs is joined with single spaces, followed by one tab and the
    space-joined row of targets.

    Args:
        ds: Iterable of ``(inputs, targets)`` pairs convertible with
            ``np.array`` (presumably shaped (1, steps, features) -- confirm
            against the dataset being passed in).

    Returns:
        The list of strings, which is also printed as a whole.
    """

    def as_text(value):
        # String tensors surface as bytes in NumPy; decode them rather than
        # stripping the b'...' wrapper from their repr.
        return value.decode("utf-8") if isinstance(value, bytes) else str(value)

    list_sets = []
    for inputs, targets in ds:
        # Only the first element of each batch is used.
        input_rows = np.transpose(np.array(inputs)[0])
        label_rows = np.transpose(np.array(targets)[0])
        for input_row, label_row in zip(input_rows, label_rows):
            left = " ".join(as_text(value) for value in input_row)
            right = " ".join(as_text(value) for value in label_row)
            # Exactly one tab between inputs and targets, no stray spaces.
            list_sets.append(left + "\t" + right)
    print(list_sets)  # prints the whole list
    return list_sets
请忽略打印整个列表时看到的是“\t”而不是实际的制表符空白这一点:如果逐行单独打印每个字符串,制表符会正常显示。Python 在打印列表时显示的是每个字符串的 repr 形式,因此制表符会以转义序列“\t”的形式出现。
如果你想要一个 pandas 数据框(DataFrame),可以尝试如下写法:
import pandas as pd

# One row per feature column of each window: squeeze away the batch axis
# (axis 0) and transpose before concatenating all windows.
features = np.concatenate(list(ds.map(lambda x, y: tf.transpose(tf.squeeze(x, axis=0)))))
targets = np.concatenate(list(ds.map(lambda x, y: tf.transpose(tf.squeeze(y, axis=0)))))

# Join each row with spaces and put a tab between the input and target parts.
feature_rows = [" ".join(item) for item in features.astype(str)]
target_rows = [" ".join(item) for item in targets.astype(str)]
values = [left + "\t" + right for left, right in zip(feature_rows, target_rows)]

types = [type(v).__name__ for v in values]
sizes = [len(v) for v in values]
df = pd.DataFrame({'Size': sizes, 'Type': types, 'Value': values})
df.index.name = 'Index'
print(df.head())
考虑以下代码:
import numpy as np
import tensorflow as tf
# Twelve rows of toy data: row k (k = 1..12) is [k, k, k, -k, -k].
simple_data_samples = np.array(
    [[k, k, k, -k, -k] for k in range(1, 13)]
)
def timeseries_dataset_multistep_combined(features, label_slice, input_sequence_length, output_sequence_length, batch_size):
    """Build a dataset of (inputs, labels) string windows from a feature array.

    Sliding windows of ``input_sequence_length + output_sequence_length``
    steps are created from *features* (no separate targets are passed), each
    batch is converted to strings and then split along axis 1 into an input
    part and a label part.

    Args:
        features: Array passed to ``timeseries_dataset_from_array`` as data.
        label_slice: Index or slice applied to the last axis of the label part.
        input_sequence_length: Number of leading steps returned as inputs.
        output_sequence_length: Number of trailing steps returned as labels.
        batch_size: Batch size forwarded to ``timeseries_dataset_from_array``.

    Returns:
        The windowed dataset mapped to ``(inputs, labels)`` string tensors.
    """
    window_length = input_sequence_length + output_sequence_length
    feature_ds = tf.keras.preprocessing.timeseries_dataset_from_array(
        features, None, window_length, batch_size=batch_size
    )

    def split_feature_label(x):
        # Stringify the numeric batch first, then cut every window in two:
        # the first `input_sequence_length` steps keep all features, the
        # remaining steps keep only the features selected by `label_slice`.
        x = tf.strings.as_string(x)
        return x[:, :input_sequence_length, :], x[:, input_sequence_length:, label_slice]

    return feature_ds.map(split_feature_label)
# Windows of 4 input steps followed by 2 label steps, one window per batch;
# slice(None) keeps every feature column in the label part.
ds = timeseries_dataset_multistep_combined(
    simple_data_samples,
    slice(None),
    input_sequence_length=4,
    output_sequence_length=2,
    batch_size=1,
)
def print_dataset(ds):
    """Print the feature and label arrays of every batch in *ds*.

    Args:
        ds: Iterable yielding ``(inputs, targets)`` pairs whose elements
            expose a ``.numpy()`` method.
    """
    for inputs, targets in ds:
        print("---Batch---")
        print("Feature:", inputs.numpy())
        print("Label:", targets.numpy())
        print("")
# Show every (feature, label) batch of the dataset built above.
print_dataset(ds)
TensorFlow 数据集“ds”由输入和目标组成。 现在我想将这个 TensorFlow 数据集转换为具有以下属性的 Python 列表:
Index Type Size Value
0 str 13 1 2 3 4 5 6
1 str 13 1 2 3 4 5 6
2 str 13 1 2 3 4 5 6
3 str 13 -1 -2 -3 -4 -5 -6
4 str 13 -1 -2 -3 -4 -5 -6
5 str 13 2 3 4 5 6 7
.... and so on
在上面的例子中,假设已经创建了一个由字符串组成的 Python 列表。在“Value”(值)一列中,左侧是 TensorFlow 数据集的输入(例如 1 2 3 4,各个值之间用一个空格分隔),右侧是对应的目标(例如 5 6,各个值之间同样用一个空格分隔)。需要注意的是,输入和目标之间用一个水平制表符“\t”分隔(例如 1 2 3 4.\t5 6.)
我该如何编码?
我使用了你的 print_dataset 函数。
def print_dataset(ds):
    """Convert every batch of *ds* into tab-separated strings; print and return them.

    For each ``(inputs, targets)`` pair the first batch element is taken and
    transposed, so that iteration runs over its last axis. Each resulting row
    of inputs is joined with single spaces, followed by one tab and the
    space-joined row of targets.

    Args:
        ds: Iterable of ``(inputs, targets)`` pairs convertible with
            ``np.array`` (presumably shaped (1, steps, features) -- confirm
            against the dataset being passed in).

    Returns:
        The list of strings, which is also printed as a whole.
    """

    def as_text(value):
        # String tensors surface as bytes in NumPy; decode them rather than
        # stripping the b'...' wrapper from their repr.
        return value.decode("utf-8") if isinstance(value, bytes) else str(value)

    list_sets = []
    for inputs, targets in ds:
        # Only the first element of each batch is used.
        input_rows = np.transpose(np.array(inputs)[0])
        label_rows = np.transpose(np.array(targets)[0])
        for input_row, label_row in zip(input_rows, label_rows):
            left = " ".join(as_text(value) for value in input_row)
            right = " ".join(as_text(value) for value in label_row)
            # Exactly one tab between inputs and targets, no stray spaces.
            list_sets.append(left + "\t" + right)
    print(list_sets)  # prints the whole list
    return list_sets
请忽略打印整个列表时看到的是“\t”而不是实际的制表符空白这一点:如果逐行单独打印每个字符串,制表符会正常显示。Python 在打印列表时显示的是每个字符串的 repr 形式,因此制表符会以转义序列“\t”的形式出现。
如果你想要一个 pandas 数据框(DataFrame),可以尝试如下写法:
import pandas as pd

# One row per feature column of each window: squeeze away the batch axis
# (axis 0) and transpose before concatenating all windows.
features = np.concatenate(list(ds.map(lambda x, y: tf.transpose(tf.squeeze(x, axis=0)))))
targets = np.concatenate(list(ds.map(lambda x, y: tf.transpose(tf.squeeze(y, axis=0)))))

# Join each row with spaces and put a tab between the input and target parts.
feature_rows = [" ".join(item) for item in features.astype(str)]
target_rows = [" ".join(item) for item in targets.astype(str)]
values = [left + "\t" + right for left, right in zip(feature_rows, target_rows)]

types = [type(v).__name__ for v in values]
sizes = [len(v) for v in values]
df = pd.DataFrame({'Size': sizes, 'Type': types, 'Value': values})
df.index.name = 'Index'
print(df.head())