Tensorflow 2.0:以函数式方式将数据集的数值特征打包在一起
Tensorflow 2.0: Packing numerical features of a dataset together in a functional way
我正在尝试复现 TensorFlow 官方教程(链接见原帖)中的代码:它会下载 CSV 文件并对数据做预处理(一直到把数值特征打包在一起这一步)。
可重现的示例如下:
import tensorflow as tf
# Report which TensorFlow version produced the output shown further down
print("TF version is: {}".format(tf.__version__))
# Download data
# The value returned by get_file is used below as the path handed to
# get_dataset; the first argument is the name the download is stored under.
train_url = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
test_url = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
train_path = tf.keras.utils.get_file("train.csv", train_url)
test_path = tf.keras.utils.get_file("test.csv", test_url)
# Get data into batched dataset
def get_dataset(path):
    """Build a batched dataset from the CSV file at ``path``.

    The ``survived`` column is split off as the label, so iterating the
    result yields ``(features, label)`` pairs. Rows are grouped in batches
    of 5 and the file is read for a single epoch. ``'?'`` is passed as an
    extra NA marker, and ``ignore_errors=True`` asks the reader to skip
    records it cannot parse.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The dataset created by ``tf.data.experimental.make_csv_dataset``.
    """
    return tf.data.experimental.make_csv_dataset(
        path,
        batch_size=5,
        num_epochs=1,
        label_name='survived',
        na_value='?',
        ignore_errors=True,
    )
# Batched (features, label) datasets for the training and evaluation CSVs
raw_train_dataset = get_dataset(train_path)
raw_test_dataset = get_dataset(test_path)
# Define numerical and categorical column lists
def get_df_batch(dataset):
    """Return the first batch of ``dataset`` as a pandas DataFrame.

    Args:
        dataset: An object whose ``take(1)`` yields ``(batch, label)`` pairs,
            where ``batch`` maps column names to values exposing ``.numpy()``
            and ``label`` exposes ``.numpy()`` as well.

    Returns:
        A DataFrame whose first column is ``survived`` (the labels), followed
        by one column per feature in the batch's own order; ``None`` when the
        dataset yields no batch at all.
    """
    # The script never imports pandas at module level, so the name ``pd``
    # was undefined here; importing locally keeps the function self-contained.
    import pandas as pd

    for batch, label in dataset.take(1):
        df = pd.DataFrame()
        df['survived'] = label.numpy()
        for key, value in batch.items():
            df[key] = value.numpy()
        # Only the first batch is wanted, so return from inside the loop.
        return df
    return None
# Materialize one batch as a DataFrame purely to inspect the column dtypes
dfb = get_df_batch(raw_train_dataset)
# Every non-object column except the 'survived' label counts as numerical
num_columns = [i for i in dfb if (dfb[i].dtype != 'O' and i!='survived')]
# Object-dtype ('O') columns hold the byte-string features, i.e. the categorical ones
cat_columns = [i for i in dfb if dfb[i].dtype == 'O']
# Combine numerical columns into one `numerics` column
class Pack:
    """Callable that merges selected feature columns into one ``numerics`` entry.

    An instance is meant to be handed to ``Dataset.map``, which calls it with
    the features mapping and the labels of each element.
    """

    def __init__(self, names):
        # Names of the feature columns to pull out and stack together.
        self.names = names

    def __call__(self, features, labels):
        """Pack the configured columns of ``features`` into ``numerics``.

        Each column named in ``self.names`` is removed from ``features``
        (the mapping is modified in place), cast to ``tf.float32`` and
        stacked with the others along axis 1. The stacked tensor is stored
        under the key ``"numerics"``.

        Returns:
            The ``(features, labels)`` pair, with ``labels`` untouched.
        """
        popped = [features.pop(column) for column in self.names]
        as_float = [tf.cast(tensor, tf.float32) for tensor in popped]
        features["numerics"] = tf.stack(as_float, axis=1)
        return features, labels
# Apply the packer to every batch of the training dataset
packed_train = raw_train_dataset.map(Pack(num_columns))
# Show what we got
def show_batch(dataset):
    """Print each feature of the first batch of ``dataset`` on its own line.

    Every line shows the feature name, left-aligned in a 20-character field,
    followed by the feature's values converted with ``.numpy()``. The label
    half of the element is not printed.
    """
    line_template = "{:20s}: {}"
    for features, _ in dataset.take(1):
        for name in features:
            print(line_template.format(name, features[name].numpy()))
# Print the first packed batch; the resulting output follows
show_batch(packed_train)
TF version is: 2.0.0
sex : [b'female' b'female' b'male' b'male' b'male']
class : [b'Third' b'First' b'Second' b'First' b'Third']
deck : [b'unknown' b'E' b'unknown' b'C' b'unknown']
embark_town : [b'Queenstown' b'Cherbourg' b'Southampton' b'Cherbourg' b'Queenstown']
alone : [b'n' b'n' b'y' b'n' b'n']
numerics : [[ 28. 1. 0. 15.5 ]
[ 40. 1. 1. 134.5 ]
[ 32. 0. 0. 10.5 ]
[ 49. 1. 0. 89.1042]
[ 2. 4. 1. 29.125 ]]
然后我尝试改用普通函数(而不是类)来打包数值特征,但失败了:
# NOTE: this is the failing attempt. Dataset.map unpacks a (features, labels)
# element into separate positional arguments, so `row` receives only the
# features dict (and the labels land in `num_columns`). Unpacking `row` into
# two names then raises the ValueError shown in the traceback.
@tf.function
def pack_func(row, num_columns=num_columns):
features, labels = row
num_features = [features.pop(name) for name in num_columns]
num_features = [tf.cast(feat, tf.float32) for feat in num_features]
num_features = tf.stack(num_features, axis=1)
features['numerics'] = num_features
return features, labels
# This call is the one that raises the ValueError shown in the traceback
packed_train = raw_train_dataset.map(pack_func)
部分回溯:
ValueError: in converted code:
:3 pack_func *
features, labels = row
ValueError: too many values to unpack (expected 2)
我有两个问题:
1. 在 Pack 类的 def __call__(self, features, labels): 中,features 和 labels 是如何被赋值的?直觉上它们应该作为已经定义好的变量传入,但我完全看不出它们是在哪里定义的。
2. 当我执行下面的代码时:
# Inspect one raw element: its type, its length, and its two halves
for row in raw_train_dataset.take(1):
    print(type(row))
    print(len(row))
    features_part, labels_part = row
    print(features_part)
    print(labels_part)
我看到 raw_train_dataset 中的 row 是一个长度为 2 的 tuple,可以成功解包成 features 和 labels。那为什么通过 map API 就不行呢?能否建议一种用函数来打包数值特征的正确写法?
非常感谢!
经过一些研究和尝试,第二个问题的答案似乎是:Dataset.map 会把元组形式的元素拆开,作为独立的位置参数传给函数,因此函数应直接接收 features 和 labels 两个参数:
def pack_func(features, labels, num_columns=num_columns):
    """Pack the columns listed in ``num_columns`` into a ``numerics`` entry.

    Written for ``Dataset.map``, which supplies ``features`` and ``labels``
    as two separate arguments. Each listed column is removed from
    ``features`` (modified in place), cast to ``tf.float32`` and stacked
    with the others along axis 1 under the key ``'numerics'``.

    Returns:
        The ``(features, labels)`` pair, with ``labels`` untouched.
    """
    popped = [features.pop(column) for column in num_columns]
    as_float = [tf.cast(tensor, tf.float32) for tensor in popped]
    features['numerics'] = tf.stack(as_float, axis=1)
    return features, labels
# pack_func now takes features and labels as two positional parameters
packed_train = raw_train_dataset.map(pack_func)
# Print the first packed batch; the resulting output follows
show_batch(packed_train)
sex : [b'male' b'male' b'male' b'female' b'male']
class : [b'Third' b'Third' b'Third' b'First' b'Third']
deck : [b'unknown' b'unknown' b'unknown' b'E' b'unknown']
embark_town : [b'Southampton' b'Southampton' b'Queenstown' b'Cherbourg' b'Queenstown']
alone : [b'y' b'n' b'n' b'n' b'y']
numerics : [[24. 0. 0. 8.05 ]
[14. 5. 2. 46.9 ]
[ 2. 4. 1. 29.125 ]
[39. 1. 1. 83.1583]
[21. 0. 0. 7.7333]]
我正在尝试复现 TensorFlow 官方教程(链接见原帖)中的代码:它会下载 CSV 文件并对数据做预处理(一直到把数值特征打包在一起这一步)。
可重现的示例如下:
import tensorflow as tf
# Report which TensorFlow version produced the output shown further down
print("TF version is: {}".format(tf.__version__))
# Download data
# The value returned by get_file is used below as the path handed to
# get_dataset; the first argument is the name the download is stored under.
train_url = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
test_url = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
train_path = tf.keras.utils.get_file("train.csv", train_url)
test_path = tf.keras.utils.get_file("test.csv", test_url)
# Get data into batched dataset
def get_dataset(path):
    """Build a batched dataset from the CSV file at ``path``.

    The ``survived`` column is split off as the label, so iterating the
    result yields ``(features, label)`` pairs. Rows are grouped in batches
    of 5 and the file is read for a single epoch. ``'?'`` is passed as an
    extra NA marker, and ``ignore_errors=True`` asks the reader to skip
    records it cannot parse.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The dataset created by ``tf.data.experimental.make_csv_dataset``.
    """
    return tf.data.experimental.make_csv_dataset(
        path,
        batch_size=5,
        num_epochs=1,
        label_name='survived',
        na_value='?',
        ignore_errors=True,
    )
# Batched (features, label) datasets for the training and evaluation CSVs
raw_train_dataset = get_dataset(train_path)
raw_test_dataset = get_dataset(test_path)
# Define numerical and categorical column lists
def get_df_batch(dataset):
    """Return the first batch of ``dataset`` as a pandas DataFrame.

    Args:
        dataset: An object whose ``take(1)`` yields ``(batch, label)`` pairs,
            where ``batch`` maps column names to values exposing ``.numpy()``
            and ``label`` exposes ``.numpy()`` as well.

    Returns:
        A DataFrame whose first column is ``survived`` (the labels), followed
        by one column per feature in the batch's own order; ``None`` when the
        dataset yields no batch at all.
    """
    # The script never imports pandas at module level, so the name ``pd``
    # was undefined here; importing locally keeps the function self-contained.
    import pandas as pd

    for batch, label in dataset.take(1):
        df = pd.DataFrame()
        df['survived'] = label.numpy()
        for key, value in batch.items():
            df[key] = value.numpy()
        # Only the first batch is wanted, so return from inside the loop.
        return df
    return None
# Materialize one batch as a DataFrame purely to inspect the column dtypes
dfb = get_df_batch(raw_train_dataset)
# Every non-object column except the 'survived' label counts as numerical
num_columns = [i for i in dfb if (dfb[i].dtype != 'O' and i!='survived')]
# Object-dtype ('O') columns hold the byte-string features, i.e. the categorical ones
cat_columns = [i for i in dfb if dfb[i].dtype == 'O']
# Combine numerical columns into one `numerics` column
class Pack:
    """Callable that merges selected feature columns into one ``numerics`` entry.

    An instance is meant to be handed to ``Dataset.map``, which calls it with
    the features mapping and the labels of each element.
    """

    def __init__(self, names):
        # Names of the feature columns to pull out and stack together.
        self.names = names

    def __call__(self, features, labels):
        """Pack the configured columns of ``features`` into ``numerics``.

        Each column named in ``self.names`` is removed from ``features``
        (the mapping is modified in place), cast to ``tf.float32`` and
        stacked with the others along axis 1. The stacked tensor is stored
        under the key ``"numerics"``.

        Returns:
            The ``(features, labels)`` pair, with ``labels`` untouched.
        """
        popped = [features.pop(column) for column in self.names]
        as_float = [tf.cast(tensor, tf.float32) for tensor in popped]
        features["numerics"] = tf.stack(as_float, axis=1)
        return features, labels
# Apply the packer to every batch of the training dataset
packed_train = raw_train_dataset.map(Pack(num_columns))
# Show what we got
def show_batch(dataset):
    """Print each feature of the first batch of ``dataset`` on its own line.

    Every line shows the feature name, left-aligned in a 20-character field,
    followed by the feature's values converted with ``.numpy()``. The label
    half of the element is not printed.
    """
    line_template = "{:20s}: {}"
    for features, _ in dataset.take(1):
        for name in features:
            print(line_template.format(name, features[name].numpy()))
# Print the first packed batch; the resulting output follows
show_batch(packed_train)
TF version is: 2.0.0
sex : [b'female' b'female' b'male' b'male' b'male']
class : [b'Third' b'First' b'Second' b'First' b'Third']
deck : [b'unknown' b'E' b'unknown' b'C' b'unknown']
embark_town : [b'Queenstown' b'Cherbourg' b'Southampton' b'Cherbourg' b'Queenstown']
alone : [b'n' b'n' b'y' b'n' b'n']
numerics : [[ 28. 1. 0. 15.5 ]
[ 40. 1. 1. 134.5 ]
[ 32. 0. 0. 10.5 ]
[ 49. 1. 0. 89.1042]
[ 2. 4. 1. 29.125 ]]
然后我尝试改用普通函数(而不是类)来打包数值特征,但失败了:
# NOTE: this is the failing attempt. Dataset.map unpacks a (features, labels)
# element into separate positional arguments, so `row` receives only the
# features dict (and the labels land in `num_columns`). Unpacking `row` into
# two names then raises the ValueError shown in the traceback.
@tf.function
def pack_func(row, num_columns=num_columns):
features, labels = row
num_features = [features.pop(name) for name in num_columns]
num_features = [tf.cast(feat, tf.float32) for feat in num_features]
num_features = tf.stack(num_features, axis=1)
features['numerics'] = num_features
return features, labels
# This call is the one that raises the ValueError shown in the traceback
packed_train = raw_train_dataset.map(pack_func)
部分回溯:
ValueError: in converted code:
:3 pack_func *
features, labels = row
ValueError: too many values to unpack (expected 2)
我有两个问题:
1. 在 Pack 类的 def __call__(self, features, labels): 中,features 和 labels 是如何被赋值的?直觉上它们应该作为已经定义好的变量传入,但我完全看不出它们是在哪里定义的。
2. 当我执行下面的代码时:
# Inspect one raw element: its type, its length, and its two halves
for row in raw_train_dataset.take(1):
    print(type(row))
    print(len(row))
    features_part, labels_part = row
    print(features_part)
    print(labels_part)
我看到 raw_train_dataset 中的 row 是一个长度为 2 的 tuple,可以成功解包成 features 和 labels。那为什么通过 map API 就不行呢?能否建议一种用函数来打包数值特征的正确写法?
非常感谢!
经过一些研究和尝试,第二个问题的答案似乎是:Dataset.map 会把元组形式的元素拆开,作为独立的位置参数传给函数,因此函数应直接接收 features 和 labels 两个参数:
def pack_func(features, labels, num_columns=num_columns):
    """Pack the columns listed in ``num_columns`` into a ``numerics`` entry.

    Written for ``Dataset.map``, which supplies ``features`` and ``labels``
    as two separate arguments. Each listed column is removed from
    ``features`` (modified in place), cast to ``tf.float32`` and stacked
    with the others along axis 1 under the key ``'numerics'``.

    Returns:
        The ``(features, labels)`` pair, with ``labels`` untouched.
    """
    popped = [features.pop(column) for column in num_columns]
    as_float = [tf.cast(tensor, tf.float32) for tensor in popped]
    features['numerics'] = tf.stack(as_float, axis=1)
    return features, labels
# pack_func now takes features and labels as two positional parameters
packed_train = raw_train_dataset.map(pack_func)
# Print the first packed batch; the resulting output follows
show_batch(packed_train)
sex : [b'male' b'male' b'male' b'female' b'male']
class : [b'Third' b'Third' b'Third' b'First' b'Third']
deck : [b'unknown' b'unknown' b'unknown' b'E' b'unknown']
embark_town : [b'Southampton' b'Southampton' b'Queenstown' b'Cherbourg' b'Queenstown']
alone : [b'y' b'n' b'n' b'n' b'y']
numerics : [[24. 0. 0. 8.05 ]
[14. 5. 2. 46.9 ]
[ 2. 4. 1. 29.125 ]
[39. 1. 1. 83.1583]
[21. 0. 0. 7.7333]]