TensorFlow:整数张量到文件模式字符串
TensorFlow: Integer tensor to file pattern string
在数据集管道的 .map() 中,我需要把一个 int 张量转换成 make_csv_dataset(...) 所需的文件模式字符串。
我收到一个错误:
ValueError: No files match `file_pattern` dataset/PAMAP2_Dataset/train/*_Tensor("strided_slice:0", shape=(), dtype=int32).csv.
这是错误:Tensor("strided_slice:0", shape=(), dtype=int32)
- 这应该是一个整数而不是这个文本.....
代码
# PAMAP2 activity IDs used as the label set.
labels = [ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24 ]
# NOTE(review): this is the as-posted code from the question; indentation was
# lost by the site's formatting, the first `dataset = (` is never closed, and
# the function has no return -- the snippet is not runnable verbatim.
def _make_dataset(idx):
# shuffle
activityID = tf.random.shuffle(labels)
dataset = (
tf.data.experimental.make_csv_dataset(
# BUG: inside interleave's map_func this runs in graph mode, so
# str(activityID[idx]) produces 'Tensor("strided_slice:0", ...)' instead of
# the integer value -- exactly the pattern seen in the ValueError above.
file_pattern=("dataset/PAMAP2_Dataset/train/*_" + str(activityID[idx]) + ".csv"),
batch_size=1,
num_epochs=1,
shuffle=False,
).batch(64, drop_remainder=True).shuffle(64)
dataset = (
tf.data.Dataset.range(1)
.interleave(
map_func=_make_dataset,
cycle_length=tf.data.AUTOTUNE,
num_parallel_calls=tf.data.AUTOTUNE,
deterministic=False,
)
)
将 tf.data.Dataset.interleave 与 make_csv_dataset 搭配使用时存在一个已知 bug。建议改用 CsvDataset API,并配合 tf.io.matching_files 来解析文件模式。可以尝试如下写法:
import pandas as pd
import tensorflow as tf
# PAMAP2 activity IDs; _make_dataset below shuffles this list.
labels = [ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24 ]
data = {
    "id": [420, 380, 390],
    "duration": [50, 40, 45],
}
# Write one dummy CSV per label so the glob pattern has files to match.
df = pd.DataFrame(data)
for label in labels:
    df.to_csv(f'test_{label}.csv', index=False)
def get_random_path(x, idx):
    """Eagerly build the glob pattern for label x[idx] (invoked via tf.py_function)."""
    label = x[idx].numpy()
    return f"/content/*_{label}.csv"
def _make_dataset(idx):
# Build a CsvDataset for one randomly chosen activity label.
# shuffle
activityID = tf.random.shuffle(labels)
# tf.py_function executes get_random_path eagerly, so .numpy() is available
# and the tensor becomes a real integer before string concatenation -- this
# is what fixes the Tensor("strided_slice:0", ...) text from the question.
path = tf.py_function(get_random_path, [activityID, idx], Tout=[tf.string])
# tf.io.matching_files expands the glob pattern into concrete file names.
dataset = tf.data.experimental.CsvDataset(
filenames=tf.io.matching_files(path), record_defaults=[tf.int32, tf.int32], header=True)
return dataset
# Drive _make_dataset through interleave; range(1) supplies the single idx.
dataset = (
tf.data.Dataset.range(1)
.interleave(_make_dataset,
cycle_length=tf.data.AUTOTUNE,
num_parallel_calls=tf.data.AUTOTUNE,
deterministic=False,
)
)
# Iterate to confirm rows are read from the matched CSV (output shown below).
for x in dataset:
print(x)
(<tf.Tensor: shape=(), dtype=int32, numpy=420>, <tf.Tensor: shape=(), dtype=int32, numpy=50>)
(<tf.Tensor: shape=(), dtype=int32, numpy=380>, <tf.Tensor: shape=(), dtype=int32, numpy=40>)
(<tf.Tensor: shape=(), dtype=int32, numpy=390>, <tf.Tensor: shape=(), dtype=int32, numpy=45>)
有关详细信息,请查看 docs。
更新 1:
print(tf.io.matching_files("/content/*csv"))
tf.Tensor(
[b'/content/test_1.csv' b'/content/test_10.csv' b'/content/test_11.csv'
b'/content/test_12.csv' b'/content/test_13.csv' b'/content/test_16.csv'
b'/content/test_17.csv' b'/content/test_18.csv' b'/content/test_19.csv'
b'/content/test_2.csv' b'/content/test_20.csv' b'/content/test_24.csv'
b'/content/test_3.csv' b'/content/test_4.csv' b'/content/test_5.csv'
b'/content/test_6.csv' b'/content/test_7.csv' b'/content/test_9.csv'], shape=(18,), dtype=string)
在数据集管道的 .map() 中,我需要把一个 int 张量转换成 make_csv_dataset(...) 所需的文件模式字符串。
我收到一个错误:
ValueError: No files match `file_pattern` dataset/PAMAP2_Dataset/train/*_Tensor("strided_slice:0", shape=(), dtype=int32).csv.
这是错误:Tensor("strided_slice:0", shape=(), dtype=int32)
- 这应该是一个整数而不是这个文本.....
代码
# PAMAP2 activity IDs used as the label set.
labels = [ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24 ]
# NOTE(review): this is the as-posted code from the question; indentation was
# lost by the site's formatting, the first `dataset = (` is never closed, and
# the function has no return -- the snippet is not runnable verbatim.
def _make_dataset(idx):
# shuffle
activityID = tf.random.shuffle(labels)
dataset = (
tf.data.experimental.make_csv_dataset(
# BUG: inside interleave's map_func this runs in graph mode, so
# str(activityID[idx]) produces 'Tensor("strided_slice:0", ...)' instead of
# the integer value -- exactly the pattern seen in the ValueError above.
file_pattern=("dataset/PAMAP2_Dataset/train/*_" + str(activityID[idx]) + ".csv"),
batch_size=1,
num_epochs=1,
shuffle=False,
).batch(64, drop_remainder=True).shuffle(64)
dataset = (
tf.data.Dataset.range(1)
.interleave(
map_func=_make_dataset,
cycle_length=tf.data.AUTOTUNE,
num_parallel_calls=tf.data.AUTOTUNE,
deterministic=False,
)
)
将 tf.data.Dataset.interleave 与 make_csv_dataset 搭配使用时存在一个已知 bug。建议改用 CsvDataset API,并配合 tf.io.matching_files 来解析文件模式。可以尝试如下写法:
import pandas as pd
import tensorflow as tf
# PAMAP2 activity IDs; _make_dataset below shuffles this list.
labels = [ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24 ]
data = {
    "id": [420, 380, 390],
    "duration": [50, 40, 45],
}
# Write one dummy CSV per label so the glob pattern has files to match.
df = pd.DataFrame(data)
for label in labels:
    df.to_csv(f'test_{label}.csv', index=False)
def get_random_path(x, idx):
    """Eagerly build the glob pattern for label x[idx] (invoked via tf.py_function)."""
    label = x[idx].numpy()
    return f"/content/*_{label}.csv"
def _make_dataset(idx):
# Build a CsvDataset for one randomly chosen activity label.
# shuffle
activityID = tf.random.shuffle(labels)
# tf.py_function executes get_random_path eagerly, so .numpy() is available
# and the tensor becomes a real integer before string concatenation -- this
# is what fixes the Tensor("strided_slice:0", ...) text from the question.
path = tf.py_function(get_random_path, [activityID, idx], Tout=[tf.string])
# tf.io.matching_files expands the glob pattern into concrete file names.
dataset = tf.data.experimental.CsvDataset(
filenames=tf.io.matching_files(path), record_defaults=[tf.int32, tf.int32], header=True)
return dataset
# Drive _make_dataset through interleave; range(1) supplies the single idx.
dataset = (
tf.data.Dataset.range(1)
.interleave(_make_dataset,
cycle_length=tf.data.AUTOTUNE,
num_parallel_calls=tf.data.AUTOTUNE,
deterministic=False,
)
)
# Iterate to confirm rows are read from the matched CSV (output shown below).
for x in dataset:
print(x)
(<tf.Tensor: shape=(), dtype=int32, numpy=420>, <tf.Tensor: shape=(), dtype=int32, numpy=50>)
(<tf.Tensor: shape=(), dtype=int32, numpy=380>, <tf.Tensor: shape=(), dtype=int32, numpy=40>)
(<tf.Tensor: shape=(), dtype=int32, numpy=390>, <tf.Tensor: shape=(), dtype=int32, numpy=45>)
有关详细信息,请查看 docs。
更新 1:
print(tf.io.matching_files("/content/*csv"))
tf.Tensor(
[b'/content/test_1.csv' b'/content/test_10.csv' b'/content/test_11.csv'
b'/content/test_12.csv' b'/content/test_13.csv' b'/content/test_16.csv'
b'/content/test_17.csv' b'/content/test_18.csv' b'/content/test_19.csv'
b'/content/test_2.csv' b'/content/test_20.csv' b'/content/test_24.csv'
b'/content/test_3.csv' b'/content/test_4.csv' b'/content/test_5.csv'
b'/content/test_6.csv' b'/content/test_7.csv' b'/content/test_9.csv'], shape=(18,), dtype=string)