What are c_state and m_state in Tensorflow LSTM?
TensorFlow r0.12's documentation for tf.nn.rnn_cell.LSTMCell describes this as the init:
tf.nn.rnn_cell.LSTMCell.__call__(inputs, state, scope=None)
where state is as follows:
state: if state_is_tuple is False, this must be a state Tensor, 2-D, batch x state_size. If state_is_tuple is True, this must be a tuple of state Tensors, both 2-D, with column sizes c_state and m_state.
What are c_state and m_state, and how do they fit into the LSTM? I cannot find a reference to them anywhere in the documentation.
Perhaps this excerpt from the code will help:
  def __call__(self, inputs, state, scope=None):
    """Long short-term memory cell (LSTM)."""
    with vs.variable_scope(scope or type(self).__name__):  # "BasicLSTMCell"
      # Parameters of gates are concatenated into one multiply for efficiency.
      if self._state_is_tuple:
        c, h = state
      else:
        c, h = array_ops.split(1, 2, state)
      concat = _linear([inputs, h], 4 * self._num_units, True)

      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      i, j, f, o = array_ops.split(1, 4, concat)

      new_c = (c * sigmoid(f + self._forget_bias) + sigmoid(i) *
               self._activation(j))
      new_h = self._activation(new_c) * sigmoid(o)

      if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
      else:
        new_state = array_ops.concat(1, [new_c, new_h])
      return new_h, new_state
I stumbled upon the same question. Here is how I understand it! A minimalistic LSTM example:
import tensorflow as tf
sample_input = tf.constant([[1,2,3]],dtype=tf.float32)
LSTM_CELL_SIZE = 2
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_CELL_SIZE, state_is_tuple=True)
state = (tf.zeros([1,LSTM_CELL_SIZE]),)*2
output, state_new = lstm_cell(sample_input, state)
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)
print(sess.run(output))
Notice state_is_tuple=True, so when passing the state to this cell, it needs to be in tuple form. c_state and m_state are probably "Memory State" and "Cell State", but honestly I am not sure, as these terms are only used in the docs. In code and papers about LSTMs, the letters h and c are commonly used to denote the "output value" and the "cell state".
http://colah.github.io/posts/2015-08-Understanding-LSTMs/
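Continuing the example above, a minimal sketch of how to look at the two parts of the returned state separately (assuming the TF 1.x graph-mode API used above; c_new and h_new are just local names introduced here):
# state_new is an LSTMStateTuple(c, h), so it can be unpacked directly.
c_new, h_new = state_new
print(sess.run(c_new))   # cell state   ("c_state")
print(sess.run(h_new))   # hidden state ("m_state")
print(sess.run(output))  # the cell's output has the same values as h_new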
Those tensors represent the combined internal state of the cell and should be passed together. The old way to do it was to simply concatenate them; the new way is to use tuples.
Old way:
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_CELL_SIZE, state_is_tuple=False)
state = tf.zeros([1,LSTM_CELL_SIZE*2])
output, state_new = lstm_cell(sample_input, state)
New way:
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(LSTM_CELL_SIZE, state_is_tuple=True)
state = (tf.zeros([1,LSTM_CELL_SIZE]),)*2
output, state_new = lstm_cell(sample_input, state)
So basically, all we did was change the state from being one tensor of length 4 into two tensors of length 2. The content remained the same: [0,0,0,0] becomes ([0,0],[0,0]). (This is supposed to make it faster.)
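A minimal sketch of converting between the two layouts (assuming the TF 1.x signatures of tf.split/tf.concat and a version where tf.nn.rnn_cell.LSTMStateTuple is available; concat_state is just a placeholder name introduced here):
# Old layout: one tensor holding [c | h] side by side.
concat_state = tf.zeros([1, LSTM_CELL_SIZE * 2])

# New layout: split into two tensors and wrap them in an LSTMStateTuple.
c, h = tf.split(concat_state, num_or_size_splits=2, axis=1)
tuple_state = tf.nn.rnn_cell.LSTMStateTuple(c, h)

# And back again: concatenating the tuple recovers the old layout.
back_to_concat = tf.concat([tuple_state.c, tuple_state.h], axis=1)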
I agree that the documentation is unclear. Looking at tf.nn.rnn_cell.LSTMCell.__call__ clarifies things (I took the code from TensorFlow 1.0.0):
  def __call__(self, inputs, state, scope=None):
    """Run one step of LSTM.

    Args:
      inputs: input Tensor, 2D, batch x num_units.
      state: if `state_is_tuple` is False, this must be a state Tensor,
        `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a
        tuple of state Tensors, both `2-D`, with column sizes `c_state` and
        `m_state`.
      scope: VariableScope for the created subgraph; defaults to "lstm_cell".

    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - Tensor(s) representing the new state of LSTM after reading `inputs` when
        the previous state was `state`. Same type and shape(s) as `state`.

    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
    num_proj = self._num_units if self._num_proj is None else self._num_proj

    if self._state_is_tuple:
      (c_prev, m_prev) = state
    else:
      c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
      m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])

    dtype = inputs.dtype
    input_size = inputs.get_shape().with_rank(2)[1]
    if input_size.value is None:
      raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
    with vs.variable_scope(scope or "lstm_cell",
                           initializer=self._initializer) as unit_scope:
      if self._num_unit_shards is not None:
        unit_scope.set_partitioner(
            partitioned_variables.fixed_size_partitioner(
                self._num_unit_shards))
      # i = input_gate, j = new_input, f = forget_gate, o = output_gate
      lstm_matrix = _linear([inputs, m_prev], 4 * self._num_units, bias=True,
                            scope=scope)
      i, j, f, o = array_ops.split(
          value=lstm_matrix, num_or_size_splits=4, axis=1)

      # Diagonal connections
      if self._use_peepholes:
        with vs.variable_scope(unit_scope) as projection_scope:
          if self._num_unit_shards is not None:
            projection_scope.set_partitioner(None)
          w_f_diag = vs.get_variable(
              "w_f_diag", shape=[self._num_units], dtype=dtype)
          w_i_diag = vs.get_variable(
              "w_i_diag", shape=[self._num_units], dtype=dtype)
          w_o_diag = vs.get_variable(
              "w_o_diag", shape=[self._num_units], dtype=dtype)

      if self._use_peepholes:
        c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) * c_prev +
             sigmoid(i + w_i_diag * c_prev) * self._activation(j))
      else:
        c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
             self._activation(j))

      if self._cell_clip is not None:
        # pylint: disable=invalid-unary-operand-type
        c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
        # pylint: enable=invalid-unary-operand-type
      if self._use_peepholes:
        m = sigmoid(o + w_o_diag * c) * self._activation(c)
      else:
        m = sigmoid(o) * self._activation(c)

      if self._num_proj is not None:
        with vs.variable_scope("projection") as proj_scope:
          if self._num_proj_shards is not None:
            proj_scope.set_partitioner(
                partitioned_variables.fixed_size_partitioner(
                    self._num_proj_shards))
          m = _linear(m, self._num_proj, bias=False, scope=scope)

          if self._proj_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
            # pylint: enable=invalid-unary-operand-type

    new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                 array_ops.concat([c, m], 1))
    return m, new_state
The key lines are:
c = (sigmoid(f + self._forget_bias) * c_prev + sigmoid(i) *
self._activation(j))
and
m = sigmoid(o) * self._activation(c)
and
new_state = (LSTMStateTuple(c, m)
If you compare the code that computes c and m with the LSTM equations (see below), you can see that they correspond to the cell state (usually denoted by c) and the hidden state (usually denoted by h), respectively:
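For reference, a standard form of the LSTM equations, written here without peephole connections (note that in the code above self._activation is tanh by default and _forget_bias is added inside the forget gate):
$$
\begin{aligned}
i_t &= \sigma(W_i\,[x_t, h_{t-1}] + b_i) && \text{(input gate)} \\
j_t &= \tanh(W_j\,[x_t, h_{t-1}] + b_j) && \text{(new input)} \\
f_t &= \sigma(W_f\,[x_t, h_{t-1}] + b_f) && \text{(forget gate)} \\
o_t &= \sigma(W_o\,[x_t, h_{t-1}] + b_o) && \text{(output gate)} \\
c_t &= f_t \odot c_{t-1} + i_t \odot j_t && \text{(cell state, c in the code)} \\
h_t &= o_t \odot \tanh(c_t) && \text{(hidden state, m in the code)}
\end{aligned}
$$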
new_state = (LSTMStateTuple(c, m)
means that the first element of the returned state tuple is c (the cell state, a.k.a. c_state), and the second element of the returned state tuple is m (the hidden state, a.k.a. m_state).
https://github.com/tensorflow/tensorflow/blob/r1.2/tensorflow/python/ops/rnn_cell_impl.py
Lines 308 - 314:
class LSTMStateTuple(_LSTMStateTuple):
  """Tuple used by LSTM Cells for `state_size`, `zero_state`, and output state.

  Stores two elements: `(c, h)`, in that order.

  Only used when `state_is_tuple=True`.
  """