Extracting numpy value from tensorflow object during transformation
I am trying to get word embeddings using TensorFlow, and I have created adjacent word lists from my corpus.
The number of unique words in my vocabulary is 8000, and there are about 1.6 million adjacent word-list pairs.
[Word lists sample image]
Since the data is quite large, I am trying to write the word lists to TFRecords files in batches.
import tensorflow as tf

def save_tfrecords_wordlist(toprocess_word_lists, path):
    writer = tf.io.TFRecordWriter(path)
    for word_list in toprocess_word_lists:
        features = tf.train.Features(
            feature={
                'word_list_X': tf.train.Feature(bytes_list=tf.train.BytesList(value=[word_list[0].encode('utf-8')])),
                'word_list_Y': tf.train.Feature(bytes_list=tf.train.BytesList(value=[word_list[1].encode('utf-8')]))
            }
        )
        example = tf.train.Example(features=features)
        writer.write(example.SerializeToString())
    writer.close()
Defining the batches:
batches = [0, 250000, 500000, 750000, 1000000, 1250000, 1500000, 1641790]

for i in range(len(batches) - 1):
    batches_start = batches[i]
    batches_end = batches[i + 1]
    print(str(batches_start) + " -- " + str(batches_end))

    toprocess_word_lists = word_lists[batches_start:batches_end]
    save_tfrecords_wordlist(toprocess_word_lists, path + "/TFRecords/data_" + str(i) + ".tfrecords")
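Since this loop writes seven shard files (data_0.tfrecords through data_6.tfrecords), they can later be read back together, because tf.data.TFRecordDataset accepts a list of filenames. A minimal sketch, assuming the same path and batches variables as above:

import tensorflow as tf

# Paths of the shards produced by the loop above.
shard_files = [path + "/TFRecords/data_" + str(i) + ".tfrecords" for i in range(len(batches) - 1)]

# TFRecordDataset reads the listed files one after another.
sharded_dataset = tf.data.TFRecordDataset(shard_files)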
##############################
def _parse_function(example_proto):
    features = {"word_list_X": tf.io.FixedLenFeature((), tf.string),
                "word_list_Y": tf.io.FixedLenFeature((), tf.string)}
    parsed_features = tf.io.parse_single_example(example_proto, features)
    """
    word_list_X = parsed_features['word_list_X'].numpy()
    word_list_Y = parsed_features['word_list_Y'].numpy()
    ## need help in getting the numpy values from the parsed_features variable so that I can get the one-hot encoding matrix which can be directly sent to tensorflow for training
    sample word_list_X value is <tf.Tensor: shape=(10,), dtype=string, numpy=array([b'for', b'for', b'for', b'you', b'you', b'you', b'you', b'to', b'to', b'to'], dtype=object)>
    sample word_list_Y value is <tf.Tensor: shape=(10,), dtype=string, numpy=array([b'is', b'to', b'recommend', b'to', b'for', b'contact', b'is', b'contact', b'you', b'the'], dtype=object)>
    """
    return parsed_features['word_list_X'], parsed_features['word_list_Y']
filenames = [ path + "/JustEat_TFRecords/data.tfrecords" ]
dataset = tf.data.TFRecordDataset(filenames)
dataset = dataset.map(_parse_function)
dataset = dataset.batch(10)
# Defining the size of the embedding
embed_size = 100
# Defining the neural network
inp = tf.keras.Input(shape=(7958,))
x = tf.keras.layers.Dense(units=embed_size, activation='linear')(inp)
x = tf.keras.layers.Dense(units=7958, activation='softmax')(x)
model = tf.keras.Model(inputs=inp, outputs=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Optimizing the network weights
# model.fit(x=X, y=Y, batch_size=256, epochs=100)
model.fit(dataset, epochs=2)
It seems that you cannot call the .numpy() function from inside a mapping function (1, 2), although I was able to manage by using the py_function from (doc).
In the example below, I map my parsed dataset to a function that converts my images to np.uint8 in order to plot them with matplotlib.
records_path = data_directory + 'TFRecords' + '/data_0.tfrecord'

# Create a dataset
dataset = tf.data.TFRecordDataset(filenames=records_path)

# Map our dataset to the parsing function
parsed_dataset = dataset.map(parsing_fn)

converted_dataset = parsed_dataset.map(lambda image, label:
                                       tf.py_function(func=converting_function,
                                                      inp=[image, label],
                                                      Tout=[np.uint8, tf.int64]))

# Gets the iterator
iterator = tf.compat.v1.data.make_one_shot_iterator(converted_dataset)

for i in range(5):
    image, label = iterator.get_next()
    plt.imshow(image)
    plt.show()
    print('label: ', label)
Output: (plotted images not shown)
Parsing function:
def parsing_fn(serialized):
    # Define a dict with the data-names and types we expect to
    # find in the TFRecords file.
    features = \
        {
            'image': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64)
        }

    # Parse the serialized data so we get a dict with our data.
    parsed_example = tf.io.parse_single_example(serialized=serialized,
                                                features=features)

    # Get the image as raw bytes.
    image_raw = parsed_example['image']

    # Decode the raw bytes so it becomes a tensor with type.
    image = tf.io.decode_jpeg(image_raw)

    # Get the label associated with the image.
    label = parsed_example['label']

    # The image and label are now correct TensorFlow types.
    return image, label
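Applied to the word-list dataset from the question, the same tf.py_function pattern might look roughly like the sketch below. The vocab list and the to_one_hot helper are introduced here purely for illustration (they are not part of the question or the original answer), and the map is assumed to run after the .batch(10) call, so each element is a 1-D batch of strings. Inside tf.py_function the arguments arrive as eager tensors, so .numpy() is available there.

import numpy as np
import tensorflow as tf

# Hypothetical vocabulary; in the question this would hold the ~8000 unique words.
vocab = ['for', 'you', 'to', 'is', 'recommend', 'contact', 'the']
word_to_index = {w: i for i, w in enumerate(vocab)}

def to_one_hot(word_list_X, word_list_Y):
    # Eager tensors here, so .numpy() works; the strings come back as bytes.
    x_idx = [word_to_index[w.decode('utf-8')] for w in word_list_X.numpy()]
    y_idx = [word_to_index[w.decode('utf-8')] for w in word_list_Y.numpy()]
    # Build one-hot rows by indexing into an identity matrix.
    x = np.eye(len(vocab), dtype=np.float32)[x_idx]
    y = np.eye(len(vocab), dtype=np.float32)[y_idx]
    return x, y

# dataset is the batched (string, string) dataset from the question.
encoded_dataset = dataset.map(
    lambda x, y: tf.py_function(func=to_one_hot,
                                inp=[x, y],
                                Tout=[tf.float32, tf.float32]))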
Related questions:
Update: I haven't actually checked it out, but tf.shape() also seems like a promising option.
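As a further alternative not covered by the original answer, the string-to-one-hot step can also stay entirely inside the graph, which avoids .numpy() and tf.py_function altogether. A rough sketch, assuming TensorFlow 2.4+ where tf.keras.layers.StringLookup is available, and assuming the vocabulary is held in a Python list named vocab (a name introduced only for this illustration):

import tensorflow as tf

# Hypothetical vocabulary; in the question this would hold the ~8000 unique words.
vocab = ['for', 'you', 'to', 'is', 'recommend', 'contact', 'the']

# StringLookup maps strings to integer ids via a static hash table,
# so it can run inside dataset.map without eager access.
lookup = tf.keras.layers.StringLookup(vocabulary=vocab, num_oov_indices=0)

def encode(word_x, word_y):
    x = tf.one_hot(lookup(word_x), depth=len(vocab))
    y = tf.one_hot(lookup(word_y), depth=len(vocab))
    return x, y

# Reuse the parsed (string, string) dataset from the question and
# one-hot encode it with graph ops only.
encoded = (tf.data.TFRecordDataset(filenames)
           .map(_parse_function)
           .map(encode)
           .batch(256))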