Nan in summary histogram for: deconv2/biases
The original size of my images is 3900 x 6000 x 3. I make overlapping patches of shape (232024, 28, 28, 3) from them and then make batches of size 1000. I have a CNN model for semantic segmentation, as follows:
import math

import numpy as np
import tensorflow as tf

def conv_layer(inputs, filters, kernel_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "conv"):
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], input_shape[3], filters], dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")
        conv2d = tf.nn.conv2d(input = tf.cast(inputs, dtype = tf.float32), filter = filter, strides = [1, strides, strides, 1], padding = padding)
        activation = tf.nn.relu(conv2d + bias)
        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)
        return tf.cast(activation, dtype = tf.float16)
def deconv_layer(inputs, filters, kernel_size, output_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "deconv"):
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        deconv_shape = tf.stack([tf.shape(inputs)[0], output_size[0], output_size[1], filters])
        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], filters, input_shape[3]], dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")
        print("bias:")
        print(bias)
        conv2d_transpose = tf.nn.conv2d_transpose(value = tf.cast(inputs, dtype = tf.float32),
                                                  filter = filter,
                                                  strides = [1, strides, strides, 1],
                                                  output_shape = deconv_shape,
                                                  padding = padding)
        activation = tf.nn.relu(conv2d_transpose + bias)
        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)
        return tf.cast(activation, dtype = tf.float16)
def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [20, 50, 90]
    conv_sizes = []
    tf.summary.image('input', features, batch_size)
    """Model function for CNN."""

    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs=features,
                      filters=conv_filters[0],
                      kernel_size=[5, 5],
                      bias_constant = bias_constant,
                      name = "conv1")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[1],
                      kernel_size = [5, 5],
                      strides = 2,
                      bias_constant = bias_constant,
                      name = "conv2")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 3
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[2],
                      kernel_size = [5, 5],
                      bias_constant = bias_constant,
                      strides = 2,
                      name = "conv3")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 3
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                          filters = conv_filters[1],
                          kernel_size = [5, 5],
                          bias_constant = bias_constant,
                          strides = 2,
                          output_size = [conv_sizes[1][1], conv_sizes[1][2]],
                          name = "deconv3")
    print(deconv.shape)

    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = deconv,
                          filters = conv_filters[0],
                          kernel_size = [5, 5],
                          bias_constant = bias_constant,
                          strides = 2,
                          output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                          name = "deconv2")
    print(deconv.shape)

    deconv = deconv_layer(inputs = deconv,
                          filters = 3,
                          kernel_size = [5, 5],
                          output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                          bias_constant = bias_constant,
                          name = "deconv1")
    print(deconv.shape)

    return deconv
epochs = 1000
learning_rate = 1e-50

image, label = tf.train.slice_input_producer([features, labels], shuffle = False)

BATCH_SIZE = 1000
THREAD_NUM = 5
MIN_AFTER_DEQUEUE = 10000
queue_capacity = MIN_AFTER_DEQUEUE + THREAD_NUM * BATCH_SIZE

image_batch, label_batch = tf.train.batch(tensors = [image, label],
                                          batch_size = BATCH_SIZE,
                                          capacity = queue_capacity,
                                          num_threads = THREAD_NUM,
                                          allow_smaller_final_batch = True)

output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)

# cost
with tf.name_scope("cross_entropy"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = output, labels = label_batch)
    cost = tf.reduce_mean(cross_entropy)
    # return cost, optimizer, accr
    tf.summary.scalar("xent", cost)

# optimizer
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

# Accuracy
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(label_batch, 1), tf.argmax(output, 1))
    accr = tf.reduce_mean(tf.cast(correct_prediction, tf.float16))
    tf.summary.scalar("accuracy", accr)

merged_summary = tf.summary.merge_all()

# Session configs
config = tf.ConfigProto()
config.log_device_placement = True
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction=0.8

# Initialize session
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())

coord = tf.train.Coordinator()
enqueue_threads = tf.train.start_queue_runners(sess = sess, coord = coord)

try:
    for epoch in range(epochs):
        if coord.should_stop():
            break
        epoch_loss = 0
        train_loss = []; train_accuracy = []
        s = sess.run(merged_summary)
        writer.add_summary(s, epoch)
        for batch in range(math.ceil(features.shape.as_list()[0]/BATCH_SIZE)):
            _, sess_cost, sess_accuracy = sess.run([optimizer, cost, accr])
            train_loss.append(sess_cost)
            train_accuracy.append(sess_accuracy)
        train_loss = np.mean(train_loss)
        train_accuracy = np.mean(train_accuracy)
        saver.save(sess, "./semantic_seg_model_1", global_step=epoch)
        print("[%02d/%02d] trainLoss: %.4f trainAcc: %.2f"
              % (epoch + 1, epochs, sess_cost, sess_accuracy))
except Exception as e:
    # Report exceptions to the coordinator.
    coord.request_stop(e)
finally:
    # Terminate as usual. It is safe to call `coord.request_stop()` twice.
    coord.request_stop()
    coord.join(enqueue_threads)

sess.close()
I get an error as soon as the training session starts. The error is as follows:
[01/1000] trainLoss: 0.0000 trainAcc: 1.00
INFO:tensorflow:Error reported to Coordinator: , Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
Caused by op 'deconv2/biases', defined at:
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py", line 16, in
    app.launch_new_instance()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelapp.py", line 478, in start
    self.io_loop.start()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 281, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 232, in dispatch_shell
    handler(stream, idents, msg)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 397, in execute_request
    user_expressions, allow_stdin)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in
    output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)
  File "", line 107, in semantic_seg_model
    name = "deconv2")
  File "", line 78, in deconv_layer
    tf.summary.histogram("biases", bias)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\summary\summary.py", line 192, in histogram
    tag=tag, values=values, name=scope)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", line 187, in _histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 2956, in create_op
    op_def=op_def)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

Number of iterations completed this epoch: 0
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1301                                  feed_dict, fetch_list, target_list,
-> 1302                                  status, run_metadata)
   1303

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
    472             compat.as_text(c_api.TF_Message(self.status.status)),
--> 473             c_api.TF_GetCode(self.status.status))
    474     # Delete the underlying status object from memory otherwise it stays alive

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
 in ()
     40 # Terminate as usual. It is safe to call coord.request_stop() twice.
     41 coord.request_stop()
---> 42 coord.join(enqueue_threads)
     43
     44 sess.close()

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\training\coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads)
    387       self._registered_threads = set()
    388       if self._exc_info_to_raise:
--> 389         six.reraise(*self._exc_info_to_raise)
    390       elif stragglers:
    391         if ignore_live_threads:

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

 in ()
     13     train_loss = []; train_accuracy = []
     14
---> 15     s = sess.run(merged_summary)
     16     writer.add_summary(s, epoch)
     17

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
    887     try:
    888       result = self._run(None, fetches, feed_dict, options_ptr,
--> 889                          run_metadata_ptr)
    890       if run_metadata:
    891         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1118     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1119       results = self._do_run(handle, final_targets, final_fetches,
-> 1120                              feed_dict_tensor, options, run_metadata)
   1121     else:
   1122       results = []

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1315     if handle is None:
   1316       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1317                            options, run_metadata)
   1318     else:
   1319       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1334       except KeyError:
   1335         pass
-> 1336       raise type(e)(node_def, op, message)
   1337
   1338   def _extend_graph(self):

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
Someone on the GitHub TensorFlow issues suggested trying to reduce the learning rate when the model diverges, but that did not help. Another suggested that the dtype should be changed from float16 to float32 because float16 is problematic. When I change the dtype of the data to float32, I get the following error in the Python log console:
[libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB.
[libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB.
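(For context: this 2GB failure is presumably the serialized graph, not GPU memory. `tf.train.slice_input_producer([features, labels])` converts the whole numpy arrays into graph constants, and a float32 copy of a (232024, 28, 28, 3) array is about 2.18 × 10⁹ bytes, just over protobuf's 2³¹-byte cap, while the float16 copy fits. A sketch of the usual workaround, feeding the data through placeholders into `tf.data` so it never enters the serialized graph; `features_ph` and `labels_ph` are illustrative names, not code from the question:)

# Sketch only: replaces the slice_input_producer / tf.train.batch pipeline
# above. `features` and `labels` are the numpy arrays from the question.
features_ph = tf.placeholder(tf.float32, shape=features.shape)
labels_ph = tf.placeholder(tf.float32, shape=labels.shape)

dataset = tf.data.Dataset.from_tensor_slices((features_ph, labels_ph))
dataset = dataset.batch(BATCH_SIZE).repeat()
iterator = dataset.make_initializable_iterator()
image_batch, label_batch = iterator.get_next()

# The arrays are fed once, at initialization time, and are never baked into
# the GraphDef, so the 2GB protobuf limit no longer applies to them:
sess.run(iterator.initializer,
         feed_dict={features_ph: features, labels_ph: labels})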
The same protobuf error occurs when I try to increase the width and height of the overlapping image patches. I have also tried reducing BATCH_SIZE, but that did not help.
I have a 4GB NVIDIA GeForce GTX 960M dedicated graphics card, 16GB of RAM, and an Intel Core i7-6700HQ CPU @ 2.60 GHz. The Python version is 3.6.4 and the TensorFlow version is 1.4 with GPU.
Update 1:
Updated model:
def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [10, 25, 90]
    conv_sizes = []
    tf.summary.image('input', features, batch_size)
    """Model function for CNN."""

    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs=features,
                      filters=conv_filters[0],
                      kernel_size=[2, 2],
                      bias_constant = bias_constant,
                      name = "conv1")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[1],
                      kernel_size = [2, 2],
                      bias_constant = bias_constant,
                      name = "conv2")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                          filters = conv_filters[0],
                          kernel_size = [2, 2],
                          bias_constant = bias_constant,
                          output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                          name = "deconv2")
    print(deconv.shape)

    deconv = deconv_layer(inputs = deconv,
                          filters = 3,
                          kernel_size = [2, 2],
                          output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                          bias_constant = bias_constant,
                          name = "deconv1")
    print(deconv.shape)

    return tf.cast(deconv, dtype = tf.float16)
I suspect the problem is one of gross overfitting; the real evidence here is:
[01/1000] trainLoss: 0.0000 trainAcc: 1.00
This says that after only one epoch you have fit the training data perfectly; a sure sign of overfitting. The resulting NaN is thus a perhaps-unsurprising consequence, since you have now almost certainly learned weights that will return 0 or inf on data or batches not yet seen (because the model is so overfit).
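To make the mechanism concrete, here is a toy numpy illustration (not taken from the model above) of how a value that saturates to inf turns into NaN downstream, which is exactly what the histogram summary then chokes on:

import numpy as np

# float16 saturates just above 65504, so one large activation becomes inf...
x = np.float16(70000.0)
print(x)                            # inf

# ...and arithmetic between infs (e.g. inside a softmax normalization)
# produces NaN:
print(x - x)                        # nan
print(np.exp(np.float32(1000.0)))   # inf: exp overflows long before float32's max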
To address this, I would suggest dramatically simplifying your model until you have something that does not overfit so quickly; e.g., fewer and smaller conv and deconv layers. Then you can start building that complexity back in. You will also likely find that you want to build in some dropout and/or batch normalization to deal with the overfitting (note: while it is tempting to start adding this complexity to your existing model, I'd suggest against it; get something simple working first, then add complexity from there...). A sketch of what that wiring looks like follows.
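This is an illustrative TF 1.x sketch only, not a drop-in change to the model above; `is_training` is a name introduced here so the layers behave differently at train and eval time:

# Illustrative only: typical dropout / batch-norm wiring in TF 1.x.
is_training = tf.placeholder(tf.bool, name="is_training")

h = conv_layer(inputs=features, filters=16, kernel_size=[3, 3], name="conv1")
h = tf.layers.batch_normalization(h, training=is_training)  # normalize activations
h = tf.layers.dropout(h, rate=0.5, training=is_training)    # randomly drop units

# batch_normalization registers moving-average update ops that must run
# alongside the train op:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(cost)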
Final note: if you simplify the problem as suggested above, you will probably have a better minimal example to share; that should let us pinpoint your problem much faster.
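One further debugging aid you could bolt on while simplifying: `tf.add_check_numerics_ops()` makes the session fail at the first op that produces a NaN or inf, which localizes the problem far better than the histogram summary does. A sketch, assuming the graph and session from the question:

# Build the check op after the graph is constructed; it attaches a
# CheckNumerics op to every floating-point tensor in the graph.
check_op = tf.add_check_numerics_ops()

# Run it together with the training step; the run now raises at the op
# that first yields a NaN/inf instead of later, in the summary writer:
_, _ = sess.run([optimizer, check_op])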