嵌入文本数据的 TFRecords
TFRecords for embedded text data
对于 Uni 的一个项目,我正在用 TensorFlow 中的神经网络实现一个问答系统(目前是 bAbI 数据集的任务 5,参见 https://research.fb.com/downloads/babi/),我想在输入管道中使用 TFRecords。
我的想法是,TFRecords 意义上的一个示例(Example)应该包括问题的上下文、问题本身、答案以及支持句编号(一个 int,指向上下文中对回答问题最重要的句子)。这是我定义函数的方式:
def make_example(context, question, answer, support):
    """Build a tf.train.SequenceExample for one QA sample.

    Args:
        context: iterable of int token ids for the context.
        question: iterable of int token ids for the question.
        answer: iterable of int token ids for the answer.
        support: int index of the supporting sentence within the context.

    Returns:
        A tf.train.SequenceExample whose feature_lists hold the three
        token sequences and whose (non-sequential) context features hold
        the single scalar ``support``.
    """
    ex = tf.train.SequenceExample()
    fl_context = ex.feature_lists.feature_list["context"]
    fl_question = ex.feature_lists.feature_list["question"]
    fl_answer = ex.feature_lists.feature_list["answer"]
    # "support" is a single scalar, so it belongs in the example's context
    # features rather than in a feature_list.
    ex.context.feature["support"].int64_list.value.append(support)
    for token in context:
        fl_context.feature.add().int64_list.value.append(token)
    for qWord in question:
        fl_question.feature.add().int64_list.value.append(qWord)
    for ansWord in answer:
        fl_answer.feature.add().int64_list.value.append(ansWord)
    # Bug fix: the original appended `support` a second time through the
    # undefined name `fl_support`, which raised NameError at runtime; the
    # value is already stored once in the context features above.
    return ex
但是,在传递上下文、问题和答案之前,我想先嵌入这些单词,用它们的 GloVe 向量来表示,即用一个 (m, d) 矩阵表示,其中 m 是句子长度,d 是每个词向量的维数。我的 make_example
函数似乎不能很好地处理这个问题,因为我得到:
TypeError: (array([[ -9.58490000e-01, 1.73210000e-01,
2.51650000e-01,
-5.61450000e-01, -1.21440000e-01, 1.54350000e+00,
-1.28930000e+00, -9.77790000e-01, -1.35480000e-01,
-6.06930000e-01, -1.37810000e+00, 6.33470000e-01,
1.33160000e-01, 2.46320000e-01, 6.60260000e-01,
-4.46130000e-02, 4.09510000e-01, -7.61670000e-01,
4.67530000e-01, -6.67810000e-01, 2.99850000e-01,
-2.74810000e-01, -5.47990000e-01, -8.56820000e-01,
5.30880000e-02, -2.01700000e+00, 7.48530000e-01,
-1.27830000e-01, 1.32050000e-01, -2.19450000e-01,
2.29830000e+00, -3.17680000e-01, -8.64940000e-01,
-1.08630000e-01, -8.13770000e-02, -7.03420000e-01,
4.60000000e-01, -3.34730000e-01, 4.37030000e-02,
-7.55080000e-01, -6.89710000e-01, 7.14380000e-01,
-8.35950000e-02, 1.58620000e-02, -5.23850000e-01,
1.72520000e-01, -4.98740000e-01, 2.30810000e-01,
-3.64690000e-01, 1.5 has type <class 'tuple'>, but expected one of:
(<class 'int'>,)
指向上面的 fl_context.feature.add().int64_list.value.append(token)
...有人可以指出我误解了 TFRecords 概念的地方,并给我建议如何解决这个问题吗?
我搜索了很多学习资料,但通常 TFRecords 上的示例都是带有图像数据的。到目前为止,我的参考资料是 https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6 and http://web.stanford.edu/class/cs20si/lectures/notes_09.pdf 。
提前致谢!
我的问题的答案可以在这里找到:https://github.com/simonada/q-and-a-tensorflow/blob/master/src/Q%26A%20with%20TF-%20TFRecords%20and%20Eager%20Execution.ipynb
我的做法如下:
将文本存储到 csv 文件中:每行(上下文、问题、答案)
定义一个函数将序列转换为tf_example,在我的例子中是
def sequence_to_tf_example(context, question, answer):
    """Vectorize one (context, question, answer) triple into token ids and
    pack each sequence into its own feature_list of a SequenceExample.

    Returns:
        A tf.train.SequenceExample with int64 feature_lists named
        "context", "question" and "answer".
    """
    ex = tf.train.SequenceExample()
    # (feature_list name, vectorized token ids) — vectorize() and
    # word_to_index are defined elsewhere in this module.
    fields = (
        ("context", vectorize(context, False, word_to_index)),
        ("question", vectorize(question, False, word_to_index)),
        ("answer", vectorize(answer, True, word_to_index)),
    )
    for name, token_ids in fields:
        feature_list = ex.feature_lists.feature_list[name]
        for token_id in token_ids:
            feature_list.feature.add().int64_list.value.append(token_id)
    return ex
定义写入函数
def write_example_to_tfrecord(context, question, answer, tfrecord_file, writer):
    """Serialize one QA triple and append it to an open TFRecord writer.

    NOTE(review): `tfrecord_file` is unused — the caller-supplied `writer`
    already targets the destination file; kept for interface compatibility.
    """
    serialized = sequence_to_tf_example(context, question, answer).SerializeToString()
    writer.write(serialized)
def write_data_to_tf_record(filename):
    """Convert ``<filename>.csv`` into ``<filename>.tfrecords``.

    The CSV is expected to have a header row followed by rows of
    (context, question, answer).

    Args:
        filename: path prefix without extension.
    """
    file_csv = filename + '.csv'
    file_tfrecords = filename + '.tfrecords'
    with open(file_csv) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        next(readCSV)  # skip header
        # Bug fix: use the writer as a context manager so the record file
        # is flushed and closed even if converting a row raises.
        with tf.python_io.TFRecordWriter(file_tfrecords) as writer:
            for row in readCSV:
                write_example_to_tfrecord(row[0], row[1], row[2],
                                          file_tfrecords, writer)
定义读取函数
def read_from_tfrecord(ex):
    """Parse one serialized SequenceExample into a dict of int64 tensors
    keyed "context", "question" and "answer"."""
    keys = ("context", "question", "answer")
    # Every field is a variable-length sequence of int64 token ids.
    sequence_features = {
        key: tf.FixedLenSequenceFeature([], dtype=tf.int64) for key in keys
    }
    # parse_single_sequence_example returns (context, sequences); only the
    # sequence part is used here.
    _, sequence_parsed = tf.parse_single_sequence_example(
        serialized=ex,
        sequence_features=sequence_features
    )
    return {key: sequence_parsed[key] for key in keys}
创建数据集
def make_dataset(path, batch_size=128):
    '''
    Makes a Tensorflow dataset that is shuffled, batched and parsed.
    '''
    # Each field pads independently along its (unknown) sequence length.
    padded_shapes = {
        name: tf.TensorShape([None])
        for name in ("context", "question", "answer")
    }
    return (
        tf.data.TFRecordDataset([path])   # raw serialized records
        .map(read_from_tfrecord)          # record -> dict of tensors
        .shuffle(buffer_size=10000)
        .padded_batch(batch_size, padded_shapes=padded_shapes)
    )
对于 Uni 的一个项目,我正在用 TensorFlow 中的神经网络实现一个问答系统(目前是 bAbI 数据集的任务 5,参见 https://research.fb.com/downloads/babi/),我想在输入管道中使用 TFRecords。
我的想法是,TFRecords 意义上的一个示例(Example)应该包括问题的上下文、问题本身、答案以及支持句编号(一个 int,指向上下文中对回答问题最重要的句子)。这是我定义函数的方式:
def make_example(context, question, answer, support):
    """Build a tf.train.SequenceExample for one QA sample.

    Args:
        context: iterable of int token ids for the context.
        question: iterable of int token ids for the question.
        answer: iterable of int token ids for the answer.
        support: int index of the supporting sentence within the context.

    Returns:
        A tf.train.SequenceExample whose feature_lists hold the three
        token sequences and whose (non-sequential) context features hold
        the single scalar ``support``.
    """
    ex = tf.train.SequenceExample()
    fl_context = ex.feature_lists.feature_list["context"]
    fl_question = ex.feature_lists.feature_list["question"]
    fl_answer = ex.feature_lists.feature_list["answer"]
    # "support" is a single scalar, so it belongs in the example's context
    # features rather than in a feature_list.
    ex.context.feature["support"].int64_list.value.append(support)
    for token in context:
        fl_context.feature.add().int64_list.value.append(token)
    for qWord in question:
        fl_question.feature.add().int64_list.value.append(qWord)
    for ansWord in answer:
        fl_answer.feature.add().int64_list.value.append(ansWord)
    # Bug fix: the original appended `support` a second time through the
    # undefined name `fl_support`, which raised NameError at runtime; the
    # value is already stored once in the context features above.
    return ex
但是,在传递上下文、问题和答案之前,我想先嵌入这些单词,用它们的 GloVe 向量来表示,即用一个 (m, d) 矩阵表示,其中 m 是句子长度,d 是每个词向量的维数。我的 make_example
函数似乎不能很好地处理这个问题,因为我得到:
TypeError: (array([[ -9.58490000e-01, 1.73210000e-01,
2.51650000e-01,
-5.61450000e-01, -1.21440000e-01, 1.54350000e+00,
-1.28930000e+00, -9.77790000e-01, -1.35480000e-01,
-6.06930000e-01, -1.37810000e+00, 6.33470000e-01,
1.33160000e-01, 2.46320000e-01, 6.60260000e-01,
-4.46130000e-02, 4.09510000e-01, -7.61670000e-01,
4.67530000e-01, -6.67810000e-01, 2.99850000e-01,
-2.74810000e-01, -5.47990000e-01, -8.56820000e-01,
5.30880000e-02, -2.01700000e+00, 7.48530000e-01,
-1.27830000e-01, 1.32050000e-01, -2.19450000e-01,
2.29830000e+00, -3.17680000e-01, -8.64940000e-01,
-1.08630000e-01, -8.13770000e-02, -7.03420000e-01,
4.60000000e-01, -3.34730000e-01, 4.37030000e-02,
-7.55080000e-01, -6.89710000e-01, 7.14380000e-01,
-8.35950000e-02, 1.58620000e-02, -5.23850000e-01,
1.72520000e-01, -4.98740000e-01, 2.30810000e-01,
-3.64690000e-01, 1.5 has type <class 'tuple'>, but expected one of:
(<class 'int'>,)
指向上面的 fl_context.feature.add().int64_list.value.append(token)
...有人可以指出我误解了 TFRecords 概念的地方,并给我建议如何解决这个问题吗?
我搜索了很多学习资料,但通常 TFRecords 上的示例都是带有图像数据的。到目前为止,我的参考资料是 https://medium.com/@TalPerry/getting-text-into-tensorflow-with-the-dataset-api-ffb832c8bec6 and http://web.stanford.edu/class/cs20si/lectures/notes_09.pdf 。
提前致谢!
我的问题的答案可以在这里找到:https://github.com/simonada/q-and-a-tensorflow/blob/master/src/Q%26A%20with%20TF-%20TFRecords%20and%20Eager%20Execution.ipynb
我的做法如下:
将文本存储到 csv 文件中:每行(上下文、问题、答案)
定义一个函数将序列转换为tf_example,在我的例子中是
def sequence_to_tf_example(context, question, answer):
    """Vectorize one (context, question, answer) triple into token ids and
    pack each sequence into its own feature_list of a SequenceExample.

    Returns:
        A tf.train.SequenceExample with int64 feature_lists named
        "context", "question" and "answer".
    """
    # Defect fixed: the original was collapsed onto a single unindented
    # line, which is not valid Python; formatting restored.
    context_ids = vectorize(context, False, word_to_index)
    question_ids = vectorize(question, False, word_to_index)
    answer_ids = vectorize(answer, True, word_to_index)
    ex = tf.train.SequenceExample()
    context_tokens = ex.feature_lists.feature_list["context"]
    question_tokens = ex.feature_lists.feature_list["question"]
    answer_tokens = ex.feature_lists.feature_list["answer"]
    for token in context_ids:
        context_tokens.feature.add().int64_list.value.append(token)
    for token in question_ids:
        question_tokens.feature.add().int64_list.value.append(token)
    for token in answer_ids:
        answer_tokens.feature.add().int64_list.value.append(token)
    return ex
定义写入函数
def write_example_to_tfrecord(context, question, answer, tfrecord_file, writer):
    """Serialize one QA triple and append it to an open TFRecord writer.

    NOTE(review): `tfrecord_file` is unused — the caller-supplied `writer`
    already targets the destination file; kept for interface compatibility.
    """
    # Defect fixed: both functions were collapsed onto one unindented line,
    # which is not valid Python; formatting restored.
    example = sequence_to_tf_example(context, question, answer)
    writer.write(example.SerializeToString())


def write_data_to_tf_record(filename):
    """Convert ``<filename>.csv`` (header row, then rows of
    context,question,answer) into ``<filename>.tfrecords``.

    Args:
        filename: path prefix without extension.
    """
    file_csv = filename + '.csv'
    file_tfrecords = filename + '.tfrecords'
    with open(file_csv) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        next(readCSV)  # skip header
        # Use the writer as a context manager so the record file is
        # flushed and closed even if converting a row raises.
        with tf.python_io.TFRecordWriter(file_tfrecords) as writer:
            for row in readCSV:
                write_example_to_tfrecord(row[0], row[1], row[2],
                                          file_tfrecords, writer)
定义读取函数
def read_from_tfrecord(ex):
    """Parse one serialized SequenceExample into a dict of int64 tensors
    keyed "context", "question" and "answer"."""
    # Defect fixed: the original was collapsed onto a single unindented
    # line, which is not valid Python; formatting restored.
    sequence_features = {
        "context": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        "question": tf.FixedLenSequenceFeature([], dtype=tf.int64),
        "answer": tf.FixedLenSequenceFeature([], dtype=tf.int64)
    }
    # Parse the example (returns a dictionary of tensors)
    _, sequence_parsed = tf.parse_single_sequence_example(
        serialized=ex,
        sequence_features=sequence_features
    )
    return {"context": sequence_parsed['context'],
            "question": sequence_parsed['question'],
            "answer": sequence_parsed['answer']}
创建数据集
def make_dataset(path, batch_size=128):
    '''
    Makes a Tensorflow dataset that is shuffled, batched and parsed.
    '''
    # Defect fixed: the original was collapsed onto a single unindented
    # line, which is not valid Python; formatting restored.
    # Read a tf record file. This makes a dataset of raw TFRecords
    dataset = tf.data.TFRecordDataset([path])
    # Apply/map the parse function to every record. Now the dataset is a
    # bunch of dictionaries of Tensors
    dataset = dataset.map(read_from_tfrecord)
    # Shuffle the dataset
    dataset = dataset.shuffle(buffer_size=10000)
    # Specify padding for each tensor separately
    dataset = dataset.padded_batch(batch_size, padded_shapes={
        "context": tf.TensorShape([None]),
        "question": tf.TensorShape([None]),
        "answer": tf.TensorShape([None])
    })
    return dataset