TensorFlow:整数张量到文件模式字符串

TensorFlow: Integer tensor to file pattern string

在数据集管道(.map())中,我需要把一个 int 张量转换成字符串,用作 make_csv_dataset(...) 的文件模式(file_pattern)。

我收到一个错误:

ValueError: No files match `file_pattern` dataset/PAMAP2_Dataset/train/*_Tensor("strided_slice:0", shape=(), dtype=int32).csv.

问题出在这一段:Tensor("strided_slice:0", shape=(), dtype=int32) —— 这里应该是一个整数,而不是这段文本。

代码

# Activity IDs; each one is expected to appear as the numeric suffix of a CSV file name.
labels = [ 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24 ]
# Builds the CSV dataset for the label found at position `idx` of a shuffled
# copy of `labels`; it is passed as `map_func` to Dataset.interleave below.
# NOTE: as posted, this snippet is incomplete -- the parenthesis opened at
# `dataset = (` is never closed and the function has no return statement.
def _make_dataset(idx):
    # shuffle
    activityID = tf.random.shuffle(labels)

    dataset = (
        tf.data.experimental.make_csv_dataset(
           # This is the failing line: str(activityID[idx]) yields the Tensor's
           # textual representation ('Tensor("strided_slice:0", shape=(),
           # dtype=int32)') instead of its integer value, which is exactly the
           # pattern reported in the ValueError.
           file_pattern=("dataset/PAMAP2_Dataset/train/*_" + str(activityID[idx]) + ".csv"),
           batch_size=1,
           num_epochs=1,
           shuffle=False,
        ).batch(64, drop_remainder=True).shuffle(64)
        
# Start from a one-element range dataset and let interleave call
# _make_dataset on that element, pulling from the datasets it returns.
dataset = tf.data.Dataset.range(1).interleave(
    map_func=_make_dataset,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
)

将 tf.data.Dataset.interleave 与 make_csv_dataset 一起使用时存在一个 bug。建议目前先改用 CsvDataset API。可以像下面这样,用 tf.io.matching_files 来解析文件模式:

import pandas as pd
import tensorflow as tf

# Activity IDs; one dummy CSV file is generated per ID below.
labels = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 24]

# Create dummy data: two integer columns with three rows each.
data = dict(id=[420, 380, 390], duration=[50, 40, 45])
df = pd.DataFrame(data)

# Write the same frame to test_<label>.csv (relative to the current working
# directory) for every label, leaving out the index column.
for i in labels:
    df.to_csv(f"test_{i}.csv", index=False)
  
def get_random_path(x, idx):
    """Return the glob pattern ``/content/*_<value>.csv`` for ``x[idx]``.

    ``x[idx]`` must expose a ``.numpy()`` method (presumably an eager
    TensorFlow tensor -- confirm against the caller); the string form of the
    value it returns becomes the file-name suffix in the pattern.
    """
    label = x[idx].numpy()
    # !s forces str(), matching plain string concatenation exactly.
    return f"/content/*_{label!s}.csv"

def _make_dataset(idx):
    """Build a CsvDataset for one randomly chosen activity label.

    Shuffles ``labels``, takes the entry at position ``idx`` and reads every
    file matching ``/content/*_<label>.csv``.

    Args:
        idx: Position to read from the shuffled labels. When this function is
            used as a ``Dataset.interleave`` map function it is expected to be
            a scalar integer tensor.

    Returns:
        A ``tf.data.experimental.CsvDataset`` over the matching files, parsed
        as two int32 columns with the header row skipped.
    """
    # shuffle
    activityID = tf.random.shuffle(labels)

    # Build the glob with graph-native string ops instead of tf.py_function:
    # the integer is converted with tf.strings.as_string, so no Python
    # callback is needed while the pipeline runs.
    path = tf.strings.join(
        ["/content/*_", tf.strings.as_string(activityID[idx]), ".csv"]
    )

    dataset = tf.data.experimental.CsvDataset(
        filenames=tf.io.matching_files(path),
        record_defaults=[tf.int32, tf.int32],
        header=True,
    )
    return dataset

# Feed the single element of range(1) to _make_dataset and interleave the
# dataset it returns.
dataset = tf.data.Dataset.range(1).interleave(
    map_func=_make_dataset,
    cycle_length=tf.data.AUTOTUNE,
    num_parallel_calls=tf.data.AUTOTUNE,
    deterministic=False,
)

# Print every parsed CSV record.
for x in dataset:
    print(x)
(<tf.Tensor: shape=(), dtype=int32, numpy=420>, <tf.Tensor: shape=(), dtype=int32, numpy=50>)
(<tf.Tensor: shape=(), dtype=int32, numpy=380>, <tf.Tensor: shape=(), dtype=int32, numpy=40>)
(<tf.Tensor: shape=(), dtype=int32, numpy=390>, <tf.Tensor: shape=(), dtype=int32, numpy=45>)

有关详细信息,请查阅相关文档(docs)。

更新 1:

# Print the paths that match the glob "/content/*csv" to check which files exist.
print(tf.io.matching_files("/content/*csv"))
tf.Tensor(
[b'/content/test_1.csv' b'/content/test_10.csv' b'/content/test_11.csv'
 b'/content/test_12.csv' b'/content/test_13.csv' b'/content/test_16.csv'
 b'/content/test_17.csv' b'/content/test_18.csv' b'/content/test_19.csv'
 b'/content/test_2.csv' b'/content/test_20.csv' b'/content/test_24.csv'
 b'/content/test_3.csv' b'/content/test_4.csv' b'/content/test_5.csv'
 b'/content/test_6.csv' b'/content/test_7.csv' b'/content/test_9.csv'], shape=(18,), dtype=string)