Split examples of a tensorflow tf.data dataset in graph execution mode

Goal

I have a tf.data.Dataset where some examples are too long (the size of axis 0 is too large). I would like to split these overly long examples into several examples, each of which is a chunk of the original one. If a particular example is not evenly divisible by the desired chunk size, I would like to truncate the remainder.

For example, if the numpy view of the original dataset looks as follows (5 elements):

>>> print(list(dataset.as_numpy_iterator()))
[array([25], dtype=int32),
 array([ 6, 91], dtype=int32),
 array([15, 30, 96], dtype=int32),
 array([14, 45, 27, 72], dtype=int32),
 array([ 7, 75, 89, 47, 66], dtype=int32)]

and the desired chunk size is 2, I would like the new dataset to look as follows (7 elements):

>>> new_dataset = chunk_dataset(dataset, chunk_size=2)
>>> print(list(new_dataset.as_numpy_iterator()))
[array([25], dtype=int32),
 array([ 6, 91], dtype=int32),
 array([15, 30], dtype=int32),
 array([14, 45], dtype=int32),
 array([27, 72], dtype=int32),
 array([ 7, 75], dtype=int32),
 array([89, 47], dtype=int32)]

Problem

I am unable to write a chunking function that works with a tf.data.Dataset where all operations run in graph mode (as opposed to eager execution). Depending on the exact chunking function I try, I run into different errors.

Note that I do know how to achieve this outside of graph mode, for example in numpy or with tf eager execution. I would like to write it as a tf.data.Dataset operation so that I can preprocess my examples efficiently.
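For reference, this is the kind of logic I am trying to express as a graph-mode tf.data transformation; a minimal numpy sketch (the helper name chunk_array is mine):

import numpy as np

def chunk_array(arr: np.ndarray, chunk_size: int) -> list:
    # Keep examples that already fit into a single chunk as-is.
    if arr.shape[0] <= chunk_size:
        return [arr]
    # Truncate the remainder, then split into equal chunks along axis 0.
    usable = (arr.shape[0] // chunk_size) * chunk_size
    return np.split(arr[:usable], usable // chunk_size)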

Code

See also this Colab notebook to reproduce my problem.

import tensorflow as tf
import numpy as np

from typing import List, Callable

"""## Code for chunking"""

def chunk_tensor_v1(input_tensor: tf.Tensor,
                    chunk_size: int) -> List[tf.Tensor]:
    # Intended behaviour: repeatedly slice off one chunk until less than
    # a full chunk remains (the remainder is dropped).
    tensor_chunks = []  # type: List[tf.Tensor]

    while tf.shape(input_tensor)[0] >= chunk_size:
        chunk = input_tensor[:chunk_size]
        tensor_chunks.append(chunk)
        input_tensor = input_tensor[chunk_size:]

    return tensor_chunks

def chunk_tensor_v2(input_tensor: tf.Tensor,
                    chunk_size: int) -> List[tf.Tensor]:
    # Intended behaviour: truncate the tensor to a multiple of chunk_size,
    # then split it into equally sized chunks along axis 0.
    frames = input_tensor.shape[0]

    if frames > chunk_size:
        remainder = frames % chunk_size
    else:
        remainder = 0

    if remainder != 0:
        input_tensor = input_tensor[:-remainder]

    num_splits = max(frames // chunk_size, 1)

    return tf.split(input_tensor, num_splits, axis=0)

def chunk_example(example: tf.Tensor,
                  chunk_size: int,
                  chunking_function: Callable) -> tf.data.Dataset:

    tensor_chunks = chunking_function(example, chunk_size=chunk_size)

    return tf.data.Dataset.from_tensor_slices(tensor_chunks)

def chunk_dataset(dataset: tf.data.Dataset, chunk_size: int, chunking_function: Callable) -> tf.data.Dataset:

    # Map each example to a sub-dataset of its chunks, then flatten the
    # resulting dataset of datasets back into a dataset of examples.
    dataset = dataset.map(lambda example: chunk_example(example=example, chunk_size=chunk_size, chunking_function=chunking_function))
    dataset = dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

    return dataset

"""## Code to create a dummy dataset"""

def create_dataset_with_single_example(size: int):
  t = tf.random.uniform((size,), minval=0, maxval=100, dtype=tf.dtypes.int32)
  d = tf.data.Dataset.from_tensors(t)

  return d

def create_dataset(num_examples: int) -> tf.data.Dataset:
  examples = [create_dataset_with_single_example(n + 1) for n in range(num_examples)]

  dataset = tf.data.Dataset.from_tensor_slices(examples)
  dataset = dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

  return dataset

"""## Testing the chunking code with the dummy dataset"""

num_examples = 5

dataset = create_dataset(num_examples)

print(list(dataset.as_numpy_iterator()))

chunk_dataset(dataset, chunk_size=2, chunking_function=chunk_tensor_v1)

chunk_dataset(dataset, chunk_size=2, chunking_function=chunk_tensor_v2)

Errors

Using chunk_tensor_v1 leads to

InaccessibleTensorError: tf.Graph captured an external symbolic tensor. The symbolic tensor <tf.Tensor 'while/strided_slice:0' shape=(None,) dtype=int32> is captured by FuncGraph(name=Dataset_map_lambda, id=140570786598224), but it is defined at FuncGraph(name=while_body_485049, id=140570787725264). A tf.Graph is not allowed to capture symoblic tensors from another graph. Use return values, explicit Python locals or TensorFlow collections to access it. Please see https://www.tensorflow.org/guide/function#all_outputs_of_a_tffunction_must_be_return_values for more information.
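As far as I can tell, AutoGraph converts the Python while loop into a tf.while_loop, so each chunk tensor is created inside the loop's own FuncGraph; appending it to the Python list tensor_chunks then makes the outer map function capture a symbolic tensor from another graph, which is exactly what the error complains about.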

chunk_tensor_v2 leads to

TypeError: '>' not supported between instances of 'NoneType' and 'int'
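The cause, as far as I can tell: inside Dataset.map the static shape of a variable-length example is unknown, so input_tensor.shape[0] is None and the comparison None > chunk_size fails. A minimal demonstration, reusing create_dataset from above:

def show_shapes(example):
    print(example.shape[0])      # None: the static shape is unknown at trace time
    print(tf.shape(example)[0])  # a symbolic scalar tensor holding the dynamic length
    return example

_ = create_dataset(3).map(show_shapes)  # the prints run once, while tracing the map function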

I am happy to edit the question if anyone knows how to simplify my problem further.

A bit tricky, but definitely possible! You could try something like this:

The core part of the code (which can probably be simplified a bit):

dataset1 = dataset.filter(lambda x: tf.less_equal(tf.shape(x)[0], chunk_size))
dataset2 = dataset.filter(lambda x: tf.greater(tf.shape(x)[0], chunk_size))

def body(i, m, n):
  # Write the next chunk_size-sized slice of m into the TensorArray n.
  n = n.write(n.size(), m[i:i+chunk_size])
  return tf.add(i, chunk_size), m, n

def split_data(data, chunk_size):
    # Truncate to a multiple of chunk_size, then collect the chunks into
    # a dynamically sized TensorArray with a graph-compatible tf.while_loop.
    length = tf.shape(data)[0]
    x = data[:(length // chunk_size) * chunk_size]
    ta = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
    i0 = tf.constant(0)
    c = lambda i, m, n: tf.less(i, tf.shape(x)[0] - 1)
    _, _, out = tf.while_loop(c, body, loop_vars=[i0, x, ta])
    return out.stack()

dataset2 = dataset2.map(lambda x: split_data(x, chunk_size))
dataset2 = dataset2.flat_map(tf.data.Dataset.from_tensor_slices)
dataset = dataset1.concatenate(dataset2)
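The idea: examples that already fit into a single chunk are routed past the chunking (dataset1); the longer ones (dataset2) are truncated to a multiple of chunk_size, cut up by the tf.while_loop into a TensorArray, flattened back into individual examples with flat_map (which plays the same role as the interleave(..., cycle_length=1) in the question), and concatenated with the short ones. If, as here, every element is a 1-D int32 tensor, the while loop can probably be replaced by a plain truncate-and-reshape; a sketch (split_data_reshape is my name for it):

def split_data_reshape(data, chunk_size):
    # Truncate to a multiple of chunk_size, then fold the remaining
    # values into a (num_chunks, chunk_size) matrix.
    length = tf.shape(data)[0]
    x = data[:(length // chunk_size) * chunk_size]
    return tf.reshape(x, (-1, chunk_size))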

The whole code:

import tensorflow as tf
tf.random.set_seed(456)

def create_dataset_with_single_example(size: int):
  t = tf.random.uniform((size,), minval=0, maxval=100, dtype=tf.dtypes.int32)
  d = tf.data.Dataset.from_tensors(t)

  return d

def create_dataset(num_examples: int) -> tf.data.Dataset:
  examples = [create_dataset_with_single_example(n + 1) for n in range(num_examples)]

  dataset = tf.data.Dataset.from_tensor_slices(examples)
  dataset = dataset.interleave(lambda x: x, cycle_length=1, num_parallel_calls=tf.data.AUTOTUNE)

  return dataset

num_examples = 5
chunk_size = 2
dataset = create_dataset(num_examples)
print('Before --> \n')
for d in dataset:
  print(d)

dataset1 = dataset.filter(lambda x: tf.less_equal(tf.shape(x)[0], chunk_size))
dataset2 = dataset.filter(lambda x: tf.greater(tf.shape(x)[0], chunk_size))

def body(i, m, n):
  n = n.write(n.size(), m[i:i+chunk_size])
  return tf.add(i,chunk_size), m, n 

def split_data(data, chunk_size):
    length = tf.shape(data)[0]
    x = data[:(length // chunk_size) * chunk_size]
    ta = tf.TensorArray(dtype=tf.int32, size=0, dynamic_size=True)
    i0 = tf.constant(0)
    c = lambda i, m, n: tf.less(i, tf.shape(x)[0] - 1)
    _, _, out = tf.while_loop(c, body, loop_vars=[i0, x, ta])
    return out.stack()

dataset2 = dataset2.map(lambda x: split_data(x, chunk_size))
dataset2 = dataset2.flat_map(tf.data.Dataset.from_tensor_slices)
dataset = dataset1.concatenate(dataset2)

print('\nAfter --> \n')
for d in dataset:
  print(d)

chunk_size = 2:

Before --> 

tf.Tensor([44], shape=(1,), dtype=int32)
tf.Tensor([23 10], shape=(2,), dtype=int32)
tf.Tensor([41 86  2], shape=(3,), dtype=int32)
tf.Tensor([54 78 20 93], shape=(4,), dtype=int32)
tf.Tensor([51 87 96 84 31], shape=(5,), dtype=int32)

After --> 

tf.Tensor([44], shape=(1,), dtype=int32)
tf.Tensor([23 10], shape=(2,), dtype=int32)
tf.Tensor([41 86], shape=(2,), dtype=int32)
tf.Tensor([54 78], shape=(2,), dtype=int32)
tf.Tensor([20 93], shape=(2,), dtype=int32)
tf.Tensor([51 87], shape=(2,), dtype=int32)
tf.Tensor([96 84], shape=(2,), dtype=int32)

chunk_size = 3:

Before --> 

tf.Tensor([44], shape=(1,), dtype=int32)
tf.Tensor([23 10], shape=(2,), dtype=int32)
tf.Tensor([41 86  2], shape=(3,), dtype=int32)
tf.Tensor([54 78 20 93], shape=(4,), dtype=int32)
tf.Tensor([51 87 96 84 31], shape=(5,), dtype=int32)

After --> 

tf.Tensor([44], shape=(1,), dtype=int32)
tf.Tensor([23 10], shape=(2,), dtype=int32)
tf.Tensor([41 86  2], shape=(3,), dtype=int32)
tf.Tensor([54 78 20], shape=(3,), dtype=int32)
tf.Tensor([51 87 96], shape=(3,), dtype=int32)

chunk_size = 4:

Before --> 

tf.Tensor([44], shape=(1,), dtype=int32)
tf.Tensor([23 10], shape=(2,), dtype=int32)
tf.Tensor([41 86  2], shape=(3,), dtype=int32)
tf.Tensor([54 78 20 93], shape=(4,), dtype=int32)
tf.Tensor([51 87 96 84 31], shape=(5,), dtype=int32)

After --> 

tf.Tensor([44], shape=(1,), dtype=int32)
tf.Tensor([23 10], shape=(2,), dtype=int32)
tf.Tensor([41 86  2], shape=(3,), dtype=int32)
tf.Tensor([54 78 20 93], shape=(4,), dtype=int32)
tf.Tensor([51 87 96 84], shape=(4,), dtype=int32)
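
One caveat: dataset1.concatenate(dataset2) emits all short examples before all chunked ones, so the original example order is not preserved in general. It is invisible in the outputs above only because the dummy examples happen to be sorted by length.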