Nan in summary histogram for: deconv2/biases
The original size of my images is 3900 x 6000 x 3. I make overlapping patches of shape (232024, 28, 28, 3) from them and then make batches of size 1000. I have a CNN model for semantic segmentation, as follows:
import math

import numpy as np
import tensorflow as tf

def conv_layer(inputs, filters, kernel_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "conv"):
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], input_shape[3], filters], dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")
        conv2d = tf.nn.conv2d(input = tf.cast(inputs, dtype = tf.float32), filter = filter, strides = [1, strides, strides, 1], padding = padding)
        activation = tf.nn.relu(conv2d + bias)
        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)
        return tf.cast(activation, dtype = tf.float16)
def deconv_layer(inputs, filters, kernel_size, output_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "deconv"):
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        deconv_shape = tf.stack([tf.shape(inputs)[0], output_size[0], output_size[1], filters])
        filter_tensor = tf.truncated_normal([kernel_size[0], kernel_size[1], filters, input_shape[3]], dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")
        print("bias:")
        print(bias)
        conv2d_transpose = tf.nn.conv2d_transpose(value = tf.cast(inputs, dtype = tf.float32),
                                                  filter = filter,
                                                  strides = [1, strides, strides, 1],
                                                  output_shape = deconv_shape,
                                                  padding = padding)
        activation = tf.nn.relu(conv2d_transpose + bias)
        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)
        return tf.cast(activation, dtype = tf.float16)
def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [20, 50, 90]
    conv_sizes = []
    tf.summary.image('input', features, batch_size)
    """Model function for CNN."""

    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs=features,
                      filters=conv_filters[0],
                      kernel_size=[5, 5],
                      bias_constant = bias_constant,
                      name = "conv1")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[1],
                      kernel_size = [5, 5],
                      strides = 2,
                      bias_constant = bias_constant,
                      name = "conv2")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 3
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[2],
                      kernel_size = [5, 5],
                      bias_constant = bias_constant,
                      strides = 2,
                      name = "conv3")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 3
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                          filters = conv_filters[1],
                          kernel_size = [5, 5],
                          bias_constant = bias_constant,
                          strides = 2,
                          output_size = [conv_sizes[1][1], conv_sizes[1][2]],
                          name = "deconv3")
    print(deconv.shape)

    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = deconv,
                          filters = conv_filters[0],
                          kernel_size = [5, 5],
                          bias_constant = bias_constant,
                          strides = 2,
                          output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                          name = "deconv2")
    print(deconv.shape)

    deconv = deconv_layer(inputs = deconv,
                          filters = 3,
                          kernel_size = [5, 5],
                          output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                          bias_constant = bias_constant,
                          name = "deconv1")
    print(deconv.shape)

    return deconv
epochs = 1000
learning_rate = 1e-50

image, label = tf.train.slice_input_producer([features, labels], shuffle = False)

BATCH_SIZE = 1000
THREAD_NUM = 5
MIN_AFTER_DEQUEUE = 10000
queue_capacity = MIN_AFTER_DEQUEUE + THREAD_NUM * BATCH_SIZE

image_batch, label_batch = tf.train.batch(tensors = [image, label],
                                          batch_size = BATCH_SIZE,
                                          capacity = queue_capacity,
                                          num_threads = THREAD_NUM,
                                          allow_smaller_final_batch = True)

output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)

# cost
with tf.name_scope("cross_entropy"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = output, labels = label_batch)
    cost = tf.reduce_mean(cross_entropy)
    # return cost, optimizer, accr
    tf.summary.scalar("xent", cost)

# optimizer
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

# Accuracy
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(label_batch, 1), tf.argmax(output, 1))
    accr = tf.reduce_mean(tf.cast(correct_prediction, tf.float16))
    tf.summary.scalar("accuracy", accr)

merged_summary = tf.summary.merge_all()

# Session configs
config = tf.ConfigProto()
config.log_device_placement = True
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction=0.8

# Initialize session
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())

coord = tf.train.Coordinator()
enqueue_threads = tf.train.start_queue_runners(sess = sess, coord = coord)

try:
    for epoch in range(epochs):
        if coord.should_stop():
            break
        epoch_loss = 0
        train_loss = []; train_accuracy = []
        s = sess.run(merged_summary)
        writer.add_summary(s, epoch)
        for batch in range(math.ceil(features.shape.as_list()[0]/BATCH_SIZE)):
            _, sess_cost, sess_accuracy = sess.run([optimizer, cost, accr])
            train_loss.append(sess_cost)
            train_accuracy.append(sess_accuracy)
        train_loss = np.mean(train_loss)
        train_accuracy = np.mean(train_accuracy)
        saver.save(sess, "./semantic_seg_model_1", global_step=epoch)
        print("[%02d/%02d] trainLoss: %.4f trainAcc: %.2f"
              % (epoch + 1, epochs, sess_cost, sess_accuracy))
except Exception as e:
    # Report exceptions to the coordinator.
    coord.request_stop(e)
finally:
    # Terminate as usual. It is safe to call `coord.request_stop()` twice.
    coord.request_stop()
    coord.join(enqueue_threads)

sess.close()
I get an error as soon as the training session starts. The error is as follows:
[01/1000] trainLoss: 0.0000 trainAcc: 1.00
INFO:tensorflow:Error reported to Coordinator: , Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
Caused by op 'deconv2/biases', defined at:
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py", line 16, in
    app.launch_new_instance()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelapp.py", line 478, in start
    self.io_loop.start()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 281, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 232, in dispatch_shell
    handler(stream, idents, msg)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", line 397, in execute_request
    user_expressions, allow_stdin)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "", line 1, in
    output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)
  File "", line 107, in semantic_seg_model
    name = "deconv2")
  File "", line 78, in deconv_layer
    tf.summary.histogram("biases", bias)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\summary\summary.py", line 192, in histogram
    tag=tag, values=values, name=scope)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", line 187, in _histogram_summary
    "HistogramSummary", tag=tag, values=values, name=name)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 2956, in create_op
    op_def=op_def)
  File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access
InvalidArgumentError (see above for traceback): Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

Number of iterations completed this epoch: 0
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1322     try:
-> 1323       return fn(*args)
   1324     except errors.OpError as e:

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1301                                  feed_dict, fetch_list, target_list,
-> 1302                                  status, run_metadata)
   1303

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
    472             compat.as_text(c_api.TF_Message(self.status.status)),
--> 473             c_api.TF_GetCode(self.status.status))
    474     # Delete the underlying status object from memory otherwise it stays alive

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
 in ()
     40 # Terminate as usual. It is safe to call coord.request_stop() twice.
     41 coord.request_stop()
---> 42 coord.join(enqueue_threads)
     43
     44 sess.close()

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\training\coordinator.py in join(self, threads, stop_grace_period_secs, ignore_live_threads)
    387       self._registered_threads = set()
    388       if self._exc_info_to_raise:
--> 389         six.reraise(*self._exc_info_to_raise)
    390       elif stragglers:
    391         if ignore_live_threads:

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\six.py in reraise(tp, value, tb)
    691             if value.__traceback__ is not tb:
    692                 raise value.with_traceback(tb)
--> 693             raise value
    694         finally:
    695             value = None

 in ()
     13     train_loss = []; train_accuracy = []
     14
---> 15     s = sess.run(merged_summary)
     16     writer.add_summary(s, epoch)
     17

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
    887     try:
    888       result = self._run(None, fetches, feed_dict, options_ptr,
--> 889                          run_metadata_ptr)
    890       if run_metadata:
    891         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1118     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1119       results = self._do_run(handle, final_targets, final_fetches,
-> 1120                              feed_dict_tensor, options, run_metadata)
   1121     else:
   1122       results = []

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1315     if handle is None:
   1316       return self._do_call(_run_fn, self._session, feeds, fetches, targets,
-> 1317                            options, run_metadata)
   1318     else:
   1319       return self._do_call(_prun_fn, self._session, handle, feeds, fetches)

c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1334       except KeyError:
   1335         pass
-> 1336       raise type(e)(node_def, op, message)
   1337
   1338   def _extend_graph(self):

InvalidArgumentError: Nan in summary histogram for: deconv2/biases
     [[Node: deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]]
     [[Node: batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
Someone on the GitHub TensorFlow issues suggested trying to reduce the learning rate when the model diverges, but that did not help. Another suggested that the dtype should be changed from float16 to float32 because float16 is problematic. When I change the dtype of the data to float32, I get the following error in the Python log console:
[libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB.
[libprotobuf ERROR C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] Exceeded maximum protobuf size of 2GB.
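(For context: this 2GB failure is presumably the serialized graph, not GPU memory. `tf.train.slice_input_producer([features, labels])` converts the whole numpy arrays into graph constants, and a float32 copy of a (232024, 28, 28, 3) array is about 2.18 × 10⁹ bytes, just over protobuf's 2³¹-byte cap, while the float16 copy fits. A sketch of the usual workaround, feeding the data through placeholders into `tf.data` so it never enters the serialized graph; `features_ph` and `labels_ph` are illustrative names, not code from the question:)

# Sketch only: replaces the slice_input_producer / tf.train.batch pipeline
# above. `features` and `labels` are the numpy arrays from the question.
features_ph = tf.placeholder(tf.float32, shape=features.shape)
labels_ph = tf.placeholder(tf.float32, shape=labels.shape)

dataset = tf.data.Dataset.from_tensor_slices((features_ph, labels_ph))
dataset = dataset.batch(BATCH_SIZE).repeat()
iterator = dataset.make_initializable_iterator()
image_batch, label_batch = iterator.get_next()

# The arrays are fed once, at initialization time, and are never baked into
# the GraphDef, so the 2GB protobuf limit no longer applies to them:
sess.run(iterator.initializer,
         feed_dict={features_ph: features, labels_ph: labels})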
The same protobuf error occurs when I try to increase the width and height of the overlapping image patches. I have also tried reducing BATCH_SIZE, but that did not help.
I have a 4GB NVIDIA GeForce GTX 960M dedicated graphics card, 16GB of RAM, and an Intel Core i7-6700HQ CPU @ 2.60 GHz. The Python version is 3.6.4 and the TensorFlow version is 1.4 with GPU.
Update 1:
Updated model:
def semantic_seg_model(features, mode, batch_size):
    bias_constant = 0.1
    conv_filters = [10, 25, 90]
    conv_sizes = []
    tf.summary.image('input', features, batch_size)
    """Model function for CNN."""

    # Encoding starts here.

    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs=features,
                      filters=conv_filters[0],
                      kernel_size=[2, 2],
                      bias_constant = bias_constant,
                      name = "conv1")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[1],
                      kernel_size = [2, 2],
                      bias_constant = bias_constant,
                      name = "conv2")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)

    # Deconvolution Layer 2
    # Input: 100 x 100
    deconv = deconv_layer(inputs = conv,
                          filters = conv_filters[0],
                          kernel_size = [2, 2],
                          bias_constant = bias_constant,
                          output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                          name = "deconv2")
    print(deconv.shape)

    deconv = deconv_layer(inputs = deconv,
                          filters = 3,
                          kernel_size = [2, 2],
                          output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                          bias_constant = bias_constant,
                          name = "deconv1")
    print(deconv.shape)

    return tf.cast(deconv, dtype = tf.float16)
I suspect the problem is one of gross overfitting; the real evidence here is:
[01/1000] trainLoss: 0.0000 trainAcc: 1.00
This says that after only one epoch you have fit the training data perfectly; a sure sign of overfitting. The resulting NaN is thus a perhaps-unsurprising consequence, since you have now almost certainly learned weights that will return 0 or inf on data or batches not yet seen (because the model is so overfit).
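To make the mechanism concrete, here is a toy numpy illustration (not taken from the model above) of how a value that saturates to inf turns into NaN downstream, which is exactly what the histogram summary then chokes on:

import numpy as np

# float16 saturates just above 65504, so one large activation becomes inf...
x = np.float16(70000.0)
print(x)                            # inf

# ...and arithmetic between infs (e.g. inside a softmax normalization)
# produces NaN:
print(x - x)                        # nan
print(np.exp(np.float32(1000.0)))   # inf: exp overflows long before float32's max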
To address this, I would suggest dramatically simplifying your model until you have something that does not overfit so quickly; e.g., fewer and smaller conv and deconv layers. Then you can start building that complexity back in. You will also likely find that you want to build in some dropout and/or batch normalization to deal with the overfitting (note: while it is tempting to start adding this complexity to your existing model, I'd suggest against it; get something simple working first, then add complexity from there...). A sketch of what that wiring looks like follows.
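This is an illustrative TF 1.x sketch only, not a drop-in change to the model above; `is_training` is a name introduced here so the layers behave differently at train and eval time:

# Illustrative only: typical dropout / batch-norm wiring in TF 1.x.
is_training = tf.placeholder(tf.bool, name="is_training")

h = conv_layer(inputs=features, filters=16, kernel_size=[3, 3], name="conv1")
h = tf.layers.batch_normalization(h, training=is_training)  # normalize activations
h = tf.layers.dropout(h, rate=0.5, training=is_training)    # randomly drop units

# batch_normalization registers moving-average update ops that must run
# alongside the train op:
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(cost)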
Final note: if you simplify the problem as suggested above, you will probably have a better minimal example to share; that should let us pinpoint your problem much faster.
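One further debugging aid you could bolt on while simplifying: `tf.add_check_numerics_ops()` makes the session fail at the first op that produces a NaN or inf, which localizes the problem far better than the histogram summary does. A sketch, assuming the graph and session from the question:

# Build the check op after the graph is constructed; it attaches a
# CheckNumerics op to every floating-point tensor in the graph.
check_op = tf.add_check_numerics_ops()

# Run it together with the training step; the run now raises at the op
# that first yields a NaN/inf instead of later, in the summary writer:
_, _ = sess.run([optimizer, check_op])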