如何在 Keras 中实现分层模型?
How to implement hierarchical model in Keras?
我正在尝试重建 https://arxiv.org/abs/1709.04250 中的模型。
作者将文本分解成话语(将它们想象成句子),然后使用双向 LSTM 组合这些话语,然后再次使用双向 LSTM,这次是在一系列话语表示上并完成它通过使用 CRF 层来预测与每个话语相关的标签。
这是模型架构的视觉效果:
enter image description here
这是我的尝试,在 Keras 中实现并使用来自 https://github.com/keras-team/keras-contrib 的 CRF 层:
embedding_layer = Embedding(len(word2id) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_nr_words, trainable=False)
crf = CRF(n_tags, sparse_target=True)
utterance_encoder = Sequential()
utterance_encoder.add(embedding_layer)
utterance_encoder.add(Bidirectional(LSTM(256, return_sequences=True)))
#This is not the pooling used in the paper but should only affect performance:
utterance_encoder.add(AveragePooling1D(max_nr_words))
utterance_encoder.add(Flatten())
utterance_encoder.summary()
model = Sequential()
model.add(TimeDistributed(utterance_encoder, input_shape = (max_nr_utterances, max_nr_words)))
model.add(Bidirectional(LSTM(256, return_sequences = True)))
model.add(crf)
model.summary()
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics = [crf_viterbi_accuracy])
model.fit(X, y, batch_size = 1)
# Here, X is of shape (51, 3391, 431) (51 documents featuring (max) 3391 utterances of (max) 431 words (represented by integer IDs)
# y is of shape (51, 3391, 52) (51 documents featuring 3391 utterances each corresponding to one of 52 labels)
不幸的是,失败并出现以下错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-9bb38cc64dfb> in <module>
22 #model.compile(optimizer="adam", loss='categorical_crossentropy', metrics = ["acc"])
23
---> 24 model.fit(X, y, batch_size = 1)
~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
821 # This is the first call of __call__, so we have to initialize.
822 initializers = []
--> 823 self._initialize(args, kwds, add_initializers_to=initializers)
824 finally:
825 # At this point we know that the initialization is complete (or less
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
695 self._concrete_stateful_fn = (
696 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 697 *args, **kwds))
698
699 def invalid_creator_scope(*unused_args, **unused_kwds):
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2853 args, kwargs = None, None
2854 with self._lock:
-> 2855 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2856 return graph_function
2857
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
3211
3212 self._function_cache.missed.add(call_context_key)
-> 3213 graph_function = self._create_graph_function(args, kwargs)
3214 self._function_cache.primary[cache_key] = graph_function
3215 return graph_function, args, kwargs
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
3073 arg_names=arg_names,
3074 override_flat_arg_shapes=override_flat_arg_shapes,
-> 3075 capture_by_value=self._capture_by_value),
3076 self._function_attributes,
3077 function_spec=self.function_spec,
~/.local/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
984 _, original_func = tf_decorator.unwrap(python_func)
985
--> 986 func_outputs = python_func(*func_args, **func_kwargs)
987
988 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
598 # __wrapped__ allows AutoGraph to swap in a converted function. We give
599 # the function a weak reference to itself to avoid a reference cycle.
--> 600 return weak_wrapped_fn().__wrapped__(*args, **kwds)
601 weak_wrapped_fn = weakref.ref(wrapped_fn)
602
~/.local/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
971 except Exception as e: # pylint:disable=broad-except
972 if hasattr(e, "ag_error_metadata"):
--> 973 raise e.ag_error_metadata.to_exception(e)
974 else:
975 raise
ValueError: in user code:
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:806 train_function *
return step_function(self, iterator)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:796 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
return fn(*args, **kwargs)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:789 run_step **
outputs = model.train_step(data)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:757 train_step
self.trainable_variables)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:2737 _minimize
trainable_variables))
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:562 _aggregate_gradients
filtered_grads_and_vars = _filter_grads(grads_and_vars)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1271 _filter_grads
([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['bidirectional_2/forward_lstm_2/lstm_cell_7/kernel:0', 'bidirectional_2/forward_lstm_2/lstm_cell_7/recurrent_kernel:0', 'bidirectional_2/forward_lstm_2/lstm_cell_7/bias:0', 'bidirectional_2/backward_lstm_2/lstm_cell_8/kernel:0', 'bidirectional_2/backward_lstm_2/lstm_cell_8/recurrent_kernel:0', 'bidirectional_2/backward_lstm_2/lstm_cell_8/bias:0', 'bidirectional_3/forward_lstm_3/lstm_cell_10/kernel:0', 'bidirectional_3/forward_lstm_3/lstm_cell_10/recurrent_kernel:0', 'bidirectional_3/forward_lstm_3/lstm_cell_10/bias:0', 'bidirectional_3/backward_lstm_3/lstm_cell_11/kernel:0', 'bidirectional_3/backward_lstm_3/lstm_cell_11/recurrent_kernel:0', 'bidirectional_3/backward_lstm_3/lstm_cell_11/bias:0', 'crf_1/kernel:0', 'crf_1/chain_kernel:0', 'crf_1/bias:0', 'crf_1/left_boundary:0', 'crf_1/right_boundary:0'].
模型摘要以防有帮助:
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 431, 300) 2867400
_________________________________________________________________
bidirectional_2 (Bidirection (None, 431, 512) 1140736
_________________________________________________________________
average_pooling1d_1 (Average (None, 1, 512) 0
_________________________________________________________________
flatten_1 (Flatten) (None, 512) 0
=================================================================
Total params: 4,008,136
Trainable params: 1,140,736
Non-trainable params: 2,867,400
_________________________________________________________________
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
time_distributed_1 (TimeDist (None, 3391, 512) 4008136
_________________________________________________________________
bidirectional_3 (Bidirection (None, 3391, 512) 1574912
_________________________________________________________________
crf_1 (CRF) (None, 3391, 52) 29484
=================================================================
Total params: 5,612,532
Trainable params: 2,745,132
Non-trainable params: 2,867,400
________________________________________________________________
如果我用密集层替换 CRF 层(仅用于测试),最终会使用大量内存,我无法 运行 任何 batch_sizes > 1。(但这是一个单独的问题)。
任何 advice/other 实现,即使在 PyTorch/Tensorflow 中,如有必要,我们将不胜感激。
谢谢
编辑:
https://github.com/YanWenqiang/HBLSTM-CRF 具有原始作者对该模型的 tensorflow 实现,但它没有维护并且对我来说是中断的。
我最终使用了 https://github.com/xuxingya/tf2crf,它得到维护并与 tf2 一起工作。
我正在尝试重建 https://arxiv.org/abs/1709.04250 中的模型。
作者将文本分解成话语(将它们想象成句子),然后使用双向 LSTM 组合这些话语,然后再次使用双向 LSTM,这次是在一系列话语表示上并完成它通过使用 CRF 层来预测与每个话语相关的标签。
这是模型架构的视觉效果: enter image description here
这是我的尝试,在 Keras 中实现并使用来自 https://github.com/keras-team/keras-contrib 的 CRF 层:
embedding_layer = Embedding(len(word2id) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_nr_words, trainable=False)
crf = CRF(n_tags, sparse_target=True)
utterance_encoder = Sequential()
utterance_encoder.add(embedding_layer)
utterance_encoder.add(Bidirectional(LSTM(256, return_sequences=True)))
#This is not the pooling used in the paper but should only affect performance:
utterance_encoder.add(AveragePooling1D(max_nr_words))
utterance_encoder.add(Flatten())
utterance_encoder.summary()
model = Sequential()
model.add(TimeDistributed(utterance_encoder, input_shape = (max_nr_utterances, max_nr_words)))
model.add(Bidirectional(LSTM(256, return_sequences = True)))
model.add(crf)
model.summary()
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics = [crf_viterbi_accuracy])
model.fit(X, y, batch_size = 1)
# Here, X is of shape (51, 3391, 431) (51 documents featuring (max) 3391 utterances of (max) 431 words (represented by integer IDs)
# y is of shape (51, 3391, 52) (51 documents featuring 3391 utterances each corresponding to one of 52 labels)
不幸的是,失败并出现以下错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-3-9bb38cc64dfb> in <module>
22 #model.compile(optimizer="adam", loss='categorical_crossentropy', metrics = ["acc"])
23
---> 24 model.fit(X, y, batch_size = 1)
~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
~/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1096 batch_size=batch_size):
1097 callbacks.on_train_batch_begin(step)
-> 1098 tmp_logs = train_function(iterator)
1099 if data_handler.should_sync:
1100 context.async_wait()
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
778 else:
779 compiler = "nonXla"
--> 780 result = self._call(*args, **kwds)
781
782 new_tracing_count = self._get_tracing_count()
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
821 # This is the first call of __call__, so we have to initialize.
822 initializers = []
--> 823 self._initialize(args, kwds, add_initializers_to=initializers)
824 finally:
825 # At this point we know that the initialization is complete (or less
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in _initialize(self, args, kwds, add_initializers_to)
695 self._concrete_stateful_fn = (
696 self._stateful_fn._get_concrete_function_internal_garbage_collected( # pylint: disable=protected-access
--> 697 *args, **kwds))
698
699 def invalid_creator_scope(*unused_args, **unused_kwds):
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _get_concrete_function_internal_garbage_collected(self, *args, **kwargs)
2853 args, kwargs = None, None
2854 with self._lock:
-> 2855 graph_function, _, _ = self._maybe_define_function(args, kwargs)
2856 return graph_function
2857
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _maybe_define_function(self, args, kwargs)
3211
3212 self._function_cache.missed.add(call_context_key)
-> 3213 graph_function = self._create_graph_function(args, kwargs)
3214 self._function_cache.primary[cache_key] = graph_function
3215 return graph_function, args, kwargs
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/function.py in _create_graph_function(self, args, kwargs, override_flat_arg_shapes)
3073 arg_names=arg_names,
3074 override_flat_arg_shapes=override_flat_arg_shapes,
-> 3075 capture_by_value=self._capture_by_value),
3076 self._function_attributes,
3077 function_spec=self.function_spec,
~/.local/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py in func_graph_from_py_func(name, python_func, args, kwargs, signature, func_graph, autograph, autograph_options, add_control_dependencies, arg_names, op_return_value, collections, capture_by_value, override_flat_arg_shapes)
984 _, original_func = tf_decorator.unwrap(python_func)
985
--> 986 func_outputs = python_func(*func_args, **func_kwargs)
987
988 # invariant: `func_outputs` contains only Tensors, CompositeTensors,
~/.local/lib/python3.6/site-packages/tensorflow/python/eager/def_function.py in wrapped_fn(*args, **kwds)
598 # __wrapped__ allows AutoGraph to swap in a converted function. We give
599 # the function a weak reference to itself to avoid a reference cycle.
--> 600 return weak_wrapped_fn().__wrapped__(*args, **kwds)
601 weak_wrapped_fn = weakref.ref(wrapped_fn)
602
~/.local/lib/python3.6/site-packages/tensorflow/python/framework/func_graph.py in wrapper(*args, **kwargs)
971 except Exception as e: # pylint:disable=broad-except
972 if hasattr(e, "ag_error_metadata"):
--> 973 raise e.ag_error_metadata.to_exception(e)
974 else:
975 raise
ValueError: in user code:
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:806 train_function *
return step_function(self, iterator)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:796 step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
return self._call_for_each_replica(fn, args, kwargs)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
return fn(*args, **kwargs)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:789 run_step **
outputs = model.train_step(data)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:757 train_step
self.trainable_variables)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/engine/training.py:2737 _minimize
trainable_variables))
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:562 _aggregate_gradients
filtered_grads_and_vars = _filter_grads(grads_and_vars)
/home/jonas/.local/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/optimizer_v2.py:1271 _filter_grads
([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['bidirectional_2/forward_lstm_2/lstm_cell_7/kernel:0', 'bidirectional_2/forward_lstm_2/lstm_cell_7/recurrent_kernel:0', 'bidirectional_2/forward_lstm_2/lstm_cell_7/bias:0', 'bidirectional_2/backward_lstm_2/lstm_cell_8/kernel:0', 'bidirectional_2/backward_lstm_2/lstm_cell_8/recurrent_kernel:0', 'bidirectional_2/backward_lstm_2/lstm_cell_8/bias:0', 'bidirectional_3/forward_lstm_3/lstm_cell_10/kernel:0', 'bidirectional_3/forward_lstm_3/lstm_cell_10/recurrent_kernel:0', 'bidirectional_3/forward_lstm_3/lstm_cell_10/bias:0', 'bidirectional_3/backward_lstm_3/lstm_cell_11/kernel:0', 'bidirectional_3/backward_lstm_3/lstm_cell_11/recurrent_kernel:0', 'bidirectional_3/backward_lstm_3/lstm_cell_11/bias:0', 'crf_1/kernel:0', 'crf_1/chain_kernel:0', 'crf_1/bias:0', 'crf_1/left_boundary:0', 'crf_1/right_boundary:0'].
模型摘要以防有帮助:
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 431, 300) 2867400
_________________________________________________________________
bidirectional_2 (Bidirection (None, 431, 512) 1140736
_________________________________________________________________
average_pooling1d_1 (Average (None, 1, 512) 0
_________________________________________________________________
flatten_1 (Flatten) (None, 512) 0
=================================================================
Total params: 4,008,136
Trainable params: 1,140,736
Non-trainable params: 2,867,400
_________________________________________________________________
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
time_distributed_1 (TimeDist (None, 3391, 512) 4008136
_________________________________________________________________
bidirectional_3 (Bidirection (None, 3391, 512) 1574912
_________________________________________________________________
crf_1 (CRF) (None, 3391, 52) 29484
=================================================================
Total params: 5,612,532
Trainable params: 2,745,132
Non-trainable params: 2,867,400
________________________________________________________________
如果我用密集层替换 CRF 层(仅用于测试),最终会使用大量内存,我无法 运行 任何 batch_sizes > 1。(但这是一个单独的问题)。
任何 advice/other 实现,即使在 PyTorch/Tensorflow 中,如有必要,我们将不胜感激。
谢谢
编辑: https://github.com/YanWenqiang/HBLSTM-CRF 具有原始作者对该模型的 tensorflow 实现,但它没有维护并且对我来说是中断的。
我最终使用了 https://github.com/xuxingya/tf2crf,它得到维护并与 tf2 一起工作。