tensorflow2 按列名将数据集拆分为两个数据集
tensorflow2 split dataset into two datasets by column names
我想将 TensorFlow 2 数据集拆分成两个数据集:一个包含“特征”,另一个包含“标签”。
在数据集中,每个元素都是一个python字典:
{'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}
{'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}
该数据集为预取数据集,从1000个txt(.gz)文件中加载。
每个文件的大小为 500KB-600KB。
无法将所有文件数据加载到内存中,因此我必须批量预取它(大小为 200)。
def a_func(file_paths, batch_size=200):
    """Attempt to load GZIP TFRecord files and split them into features and labels.

    NOTE: this is the asker's non-working attempt, kept as posted; the
    ``map`` calls below fail (see the inline comments).

    Args:
        file_paths: Passed to ``tf.data.TFRecordDataset`` as the files to read.
        batch_size: Passed to both ``batch`` and ``prefetch``.

    Returns:
        A ``(features, label)`` pair -- not reached as written, because the
        first ``map`` call raises.
    """
    dataset = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(batch_size)
    # Split the dataset into features and labels?
    # Why this fails: the argument parses as the generator expression
    # `(lambda x: col) for col in x if ...`, so the `x` in `for col in x` is
    # looked up in the enclosing scope instead of being the lambda's parameter.
    label = dataset.map(lambda x: col for col in x if col=='label')  # error: "x" not defined
    features = dataset.map(lambda x: col for col in x if col!='label')
    return features, label
如何将数据集按列名拆分为两个数据集?
更新
# Follow-up attempt: read the raw TFRecord dataset and index each element by key.
ds = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
type(ds)
# tensorflow.python.data.ops.readers.TFRecordDatasetV2
# The map below fails: `x` is a tensor here, not a dict, so it cannot be
# indexed with the string 'label'.
# NOTE(review): presumably each element is still a serialized record that has
# to be parsed (e.g. with tf.io.parse_single_example) before keys exist -- confirm.
label = ds.map(lambda x: x['label'])
# TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'label'
for row in ds:
    print(row.numpy())  # shows a lot of hex chars
如果我理解正确(IIUC),你可以这样试试:
import tensorflow as tf
# Dummy data: two single-element datasets, each holding one dict-structured record.
ds1 = tf.data.Dataset.from_tensors({
    'id': 11,
    'val1': [0.22, 0.36],
    'val2': [81],
    'val3': ['swimming', 'running', 'jumpoing'],
    'label': 0.0,
})
ds2 = tf.data.Dataset.from_tensors({
    'id': 29,
    'val1': [0.97, 0.52],
    'val2': [627],
    'val3': ['baseball', 'football', 'basketball'],
    'label': 1.0,
})
ds = ds1.concatenate(ds2)


def _select_label(record):
    """Return only the 'label' entry of a record."""
    return record['label']


def _select_features(record):
    """Return the non-label entries of a record as a tuple."""
    return (record['id'], record['val1'], record['val2'], record['val3'])


# Both views are derived from the same source dataset, element by element.
label = ds.map(_select_label)
features = ds.map(_select_features)

for label_value in label:
    print(label_value)
for feature_values in features:
    print(feature_values)
tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)
(<tf.Tensor: shape=(), dtype=int32, numpy=11>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.22, 0.36], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([81], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'swimming', b'running', b'jumpoing'], dtype=object)>)
(<tf.Tensor: shape=(), dtype=int32, numpy=29>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.97, 0.52], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([627], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'baseball', b'football', b'basketball'], dtype=object)>)
或使用动态键:
import tensorflow as tf
import numpy as np
# Create dummy data: two single-element datasets, each holding one dict record.
ds1 = tf.data.Dataset.from_tensors({
    'id': 11,
    'val1': [0.22, 0.36],
    'val2': [81],
    'val3': ['swimming', 'running', 'jumpoing'],
    'label': 0.0,
})
ds2 = tf.data.Dataset.from_tensors({
    'id': 29,
    'val1': [0.97, 0.52],
    'val2': [627],
    'val3': ['baseball', 'football', 'basketball'],
    'label': 1.0,
})
ds = ds1.concatenate(ds2)

label = ds.map(lambda x: x['label'])
# Select every key except 'label' without hard-coding the feature names.
# Keys are visited in sorted order (the same order np.setdiff1d produced), and
# the result is an explicit tuple so each feature stays a separate component
# instead of relying on tf.data converting a returned list.
features = ds.map(lambda x: tuple([x[key] for key in sorted(x) if key != 'label']))
我想将 TensorFlow 2 数据集拆分成两个数据集:一个包含“特征”,另一个包含“标签”。
在数据集中,每个元素都是一个python字典:
{'id': 29, 'val1': [0.97, 0.52], 'val2': [627], 'val3': ['baseball', 'football', 'basketball'], 'label': 1.0}
{'id': 11, 'val1': [0.22, 0.36], 'val2': [81], 'val3': ['swimming', 'running', 'jumpoing'], 'label': 0.0}
该数据集为预取数据集,从1000个txt(.gz)文件中加载。 每个文件的大小为 500KB-600KB。 无法将所有文件数据加载到内存中,因此我必须批量预取它(大小为 200)。
def a_func(file_paths, batch_size=200):
    """Attempt to load GZIP TFRecord files and split them into features and labels.

    NOTE: this is the asker's non-working attempt, kept as posted; the
    ``map`` calls below fail (see the inline comments).

    Args:
        file_paths: Passed to ``tf.data.TFRecordDataset`` as the files to read.
        batch_size: Passed to both ``batch`` and ``prefetch``.

    Returns:
        A ``(features, label)`` pair -- not reached as written, because the
        first ``map`` call raises.
    """
    dataset = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(batch_size)
    # Split the dataset into features and labels?
    # Why this fails: the argument parses as the generator expression
    # `(lambda x: col) for col in x if ...`, so the `x` in `for col in x` is
    # looked up in the enclosing scope instead of being the lambda's parameter.
    label = dataset.map(lambda x: col for col in x if col=='label')  # error: "x" not defined
    features = dataset.map(lambda x: col for col in x if col!='label')
    return features, label
如何将数据集按列名拆分为两个数据集?
更新
# Follow-up attempt: read the raw TFRecord dataset and index each element by key.
ds = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')
type(ds)
# tensorflow.python.data.ops.readers.TFRecordDatasetV2
# The map below fails: `x` is a tensor here, not a dict, so it cannot be
# indexed with the string 'label'.
# NOTE(review): presumably each element is still a serialized record that has
# to be parsed (e.g. with tf.io.parse_single_example) before keys exist -- confirm.
label = ds.map(lambda x: x['label'])
# TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'label'
for row in ds:
    print(row.numpy())  # shows a lot of hex chars
如果我理解正确(IIUC),你可以这样试试:
import tensorflow as tf
# Dummy data: two single-element datasets, each holding one dict-structured record.
ds1 = tf.data.Dataset.from_tensors({
    'id': 11,
    'val1': [0.22, 0.36],
    'val2': [81],
    'val3': ['swimming', 'running', 'jumpoing'],
    'label': 0.0,
})
ds2 = tf.data.Dataset.from_tensors({
    'id': 29,
    'val1': [0.97, 0.52],
    'val2': [627],
    'val3': ['baseball', 'football', 'basketball'],
    'label': 1.0,
})
ds = ds1.concatenate(ds2)


def _select_label(record):
    """Return only the 'label' entry of a record."""
    return record['label']


def _select_features(record):
    """Return the non-label entries of a record as a tuple."""
    return (record['id'], record['val1'], record['val2'], record['val3'])


# Both views are derived from the same source dataset, element by element.
label = ds.map(_select_label)
features = ds.map(_select_features)

for label_value in label:
    print(label_value)
for feature_values in features:
    print(feature_values)
tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)
(<tf.Tensor: shape=(), dtype=int32, numpy=11>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.22, 0.36], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([81], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'swimming', b'running', b'jumpoing'], dtype=object)>)
(<tf.Tensor: shape=(), dtype=int32, numpy=29>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.97, 0.52], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([627], dtype=int32)>, <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'baseball', b'football', b'basketball'], dtype=object)>)
或使用动态键:
import tensorflow as tf
import numpy as np
# Create dummy data: two single-element datasets, each holding one dict record.
ds1 = tf.data.Dataset.from_tensors({
    'id': 11,
    'val1': [0.22, 0.36],
    'val2': [81],
    'val3': ['swimming', 'running', 'jumpoing'],
    'label': 0.0,
})
ds2 = tf.data.Dataset.from_tensors({
    'id': 29,
    'val1': [0.97, 0.52],
    'val2': [627],
    'val3': ['baseball', 'football', 'basketball'],
    'label': 1.0,
})
ds = ds1.concatenate(ds2)

label = ds.map(lambda x: x['label'])
# Select every key except 'label' without hard-coding the feature names.
# Keys are visited in sorted order (the same order np.setdiff1d produced), and
# the result is an explicit tuple so each feature stays a separate component
# instead of relying on tf.data converting a returned list.
features = ds.map(lambda x: tuple([x[key] for key in sorted(x) if key != 'label']))