TensorFlow 分析器（Profiler）为 Conv2D 输出 2 FLOPs 而不是 1
TensorFlow Profiler outputs 2 FLOPs for a Conv2D instead of 1
我想知道是否有人知道为什么 Conv2d
操作的 FLOP 数量是 2 而不是 1。在下面的示例中,输入是具有 1 个通道的 1x1
图像批量大小为 1。卷积中的特征数也为 1,没有偏差。理想情况下,乘法次数应为 1。但 TF 分析器的输出显示 FLOP 为 2。FLOP 是否包含除乘法以外的其他内容?谢谢
示例如下:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # assuming you have a gpu0
import tensorflow as tf
from keras import backend as K
def load_pb(pb):
    """Load a frozen GraphDef protobuf and import it into a fresh Graph.

    Args:
        pb: Path to a serialized (frozen) GraphDef ``.pb`` file.

    Returns:
        A ``tf.Graph`` holding the imported graph. An empty ``name``
        prefix is used so node names match the original graph exactly.
    """
    # NOTE(review): tf.gfile / tf.GraphDef are TF 1.x APIs
    # (tf.compat.v1 in TF 2.x) — assumed intentional here.
    with tf.gfile.GFile(pb, "rb") as pb_file:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(pb_file.read())
    with tf.Graph().as_default() as imported_graph:
        tf.import_graph_def(graph_def, name='')
        return imported_graph
def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    """Freeze the graph of a TF 1.x session: fold variables into constants.

    Args:
        session: The ``tf.Session`` whose graph should be frozen.
        keep_var_names: Optional iterable of variable op names to keep as
            variables (i.e. exclude from freezing).
        output_names: Optional list of output op names to preserve. The
            passed list is NOT modified (a copy is taken).
        clear_devices: If True, strip device placements from every node so
            the frozen graph is portable across machines.

    Returns:
        A ``GraphDef`` with all (non-kept) variables converted to Const ops.
    """
    from tensorflow.python.framework.graph_util import convert_variables_to_constants

    graph = session.graph
    with graph.as_default():
        freeze_var_names = list(
            set(v.op.name for v in tf.global_variables()).difference(keep_var_names or [])
        )
        # BUG FIX: the original did `output_names = output_names or []` then
        # `output_names += [...]`, which mutates the caller's list in place
        # when one is passed. Copy instead so the argument stays untouched.
        output_names = list(output_names or [])
        output_names += [v.op.name for v in tf.global_variables()]
        input_graph_def = graph.as_graph_def()
        if clear_devices:
            # Remove device pins so the frozen graph loads anywhere.
            for node in input_graph_def.node:
                node.device = ""
        frozen_graph = convert_variables_to_constants(
            session, input_graph_def, output_names, freeze_var_names
        )
        return frozen_graph
# ---- Build the minimal model: 1x1x1x1 input -> 1x1 conv (no bias) -> flatten ----
inp = tf.keras.layers.Input(batch_shape=(1, 1, 1, 1), name='input')
conv_out = tf.keras.layers.Conv2D(
    1, kernel_size=(1, 1), strides=(1, 1), padding='same',
    name='conv', use_bias=False)(inp)
flattened = tf.keras.layers.Flatten(name='output')(conv_out)
model = tf.keras.models.Model(inputs=inp, outputs=flattened)
model.summary()

# ---- Freeze the session graph to constants and serialize it to disk ----
output_graph_def = freeze_session(
    K.get_session(),
    output_names=[t.op.name for t in model.outputs])
with tf.gfile.GFile('graph.pb', "wb") as f:
    f.write(output_graph_def.SerializeToString())

# ---- Reload the frozen protobuf and count float ops with the profiler ----
frozen_graph = load_pb('./graph.pb')
with frozen_graph.as_default():
    opts = tf.profiler.ProfileOptionBuilder.float_operation()
    stats = tf.profiler.profile(
        frozen_graph, run_meta=tf.RunMetadata(), cmd='scope', options=opts)
    print('FLOP', stats.total_float_ops)
输出为:
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input (InputLayer) (1, 1, 1, 1) 0
_________________________________________________________________
conv (Conv2D) (1, 1, 1, 1) 1
_________________________________________________________________
output (Flatten) (1, 1) 0
=================================================================
Total params: 1
Trainable params: 1
Non-trainable params: 0
_________________________________________________________________
Converted 1 variables to const ops.
Parsing Inputs...
=========================Options=============================
-max_depth 10000
-min_bytes 0
-min_peak_bytes 0
-min_residual_bytes 0
-min_output_bytes 0
-min_micros 0
-min_accelerator_micros 0
-min_cpu_micros 0
-min_params 0
-min_float_ops 1
-min_occurrence 0
-step -1
-order_by float_ops
-account_type_regexes .*
-start_name_regexes .*
-trim_name_regexes
-show_name_regexes .*
-hide_name_regexes
-account_displayed_op_only true
-select float_ops
-output stdout:
==================Model Analysis Report======================
Doc:
scope: The nodes in the model graph are organized by their names, which is hierarchical like filesystem.
flops: Number of float operations. Note: Please read the implementation for the math behind it.
Profile:
node name | # float_ops
_TFProfRoot (--/2 flops)
conv/Conv2D (2/2 flops)
======================End of Report==========================
FLOP 2
考虑与您几乎相同的设置,但卷积有 n 个通道。
然后你会有n次乘法,然后你会累加所有乘法的结果。
现在可以说,你可以用第一次乘法的结果来初始化和,然后对剩下的 (n-1) 次乘法求和。
但这将是对第一次乘法的特殊处理,相反,将和初始化为 0,然后将其与所有 n 次乘法累加相加更有意义。
特别是当 n=1 时,你会遇到一个荒谬的情况
sum = 0
mult = w1 * a1
sum = sum + mult
这将导致 2 个 FLOP,或 1 个 MAC(乘法-累加)运算。
我想知道是否有人知道为什么 Conv2d
操作的 FLOP 数量是 2 而不是 1。在下面的示例中,输入是具有 1 个通道的 1x1
图像批量大小为 1。卷积中的特征数也为 1,没有偏差。理想情况下,乘法次数应为 1。但 TF 分析器的输出显示 FLOP 为 2。FLOP 是否包含除乘法以外的其他内容?谢谢
示例如下:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0' # assuming you have a gpu0
import tensorflow as tf
from keras import backend as K
def load_pb(pb):
    """Load a frozen GraphDef protobuf and import it into a fresh Graph.

    Args:
        pb: Path to a serialized (frozen) GraphDef ``.pb`` file.

    Returns:
        A ``tf.Graph`` holding the imported graph. An empty ``name``
        prefix is used so node names match the original graph exactly.
    """
    # NOTE(review): tf.gfile / tf.GraphDef are TF 1.x APIs
    # (tf.compat.v1 in TF 2.x) — assumed intentional here.
    with tf.gfile.GFile(pb, "rb") as pb_file:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(pb_file.read())
    with tf.Graph().as_default() as imported_graph:
        tf.import_graph_def(graph_def, name='')
        return imported_graph
def freeze_session(session, keep_var_names=None, output_names=None, clear_devices=True):
    """Freeze the graph of a TF 1.x session: fold variables into constants.

    Args:
        session: The ``tf.Session`` whose graph should be frozen.
        keep_var_names: Optional iterable of variable op names to keep as
            variables (i.e. exclude from freezing).
        output_names: Optional list of output op names to preserve. The
            passed list is NOT modified (a copy is taken).
        clear_devices: If True, strip device placements from every node so
            the frozen graph is portable across machines.

    Returns:
        A ``GraphDef`` with all (non-kept) variables converted to Const ops.
    """
    from tensorflow.python.framework.graph_util import convert_variables_to_constants

    graph = session.graph
    with graph.as_default():
        freeze_var_names = list(
            set(v.op.name for v in tf.global_variables()).difference(keep_var_names or [])
        )
        # BUG FIX: the original did `output_names = output_names or []` then
        # `output_names += [...]`, which mutates the caller's list in place
        # when one is passed. Copy instead so the argument stays untouched.
        output_names = list(output_names or [])
        output_names += [v.op.name for v in tf.global_variables()]
        input_graph_def = graph.as_graph_def()
        if clear_devices:
            # Remove device pins so the frozen graph loads anywhere.
            for node in input_graph_def.node:
                node.device = ""
        frozen_graph = convert_variables_to_constants(
            session, input_graph_def, output_names, freeze_var_names
        )
        return frozen_graph
# ---- Build the minimal model: 1x1x1x1 input -> 1x1 conv (no bias) -> flatten ----
inp = tf.keras.layers.Input(batch_shape=(1, 1, 1, 1), name='input')
conv_out = tf.keras.layers.Conv2D(
    1, kernel_size=(1, 1), strides=(1, 1), padding='same',
    name='conv', use_bias=False)(inp)
flattened = tf.keras.layers.Flatten(name='output')(conv_out)
model = tf.keras.models.Model(inputs=inp, outputs=flattened)
model.summary()

# ---- Freeze the session graph to constants and serialize it to disk ----
output_graph_def = freeze_session(
    K.get_session(),
    output_names=[t.op.name for t in model.outputs])
with tf.gfile.GFile('graph.pb', "wb") as f:
    f.write(output_graph_def.SerializeToString())

# ---- Reload the frozen protobuf and count float ops with the profiler ----
frozen_graph = load_pb('./graph.pb')
with frozen_graph.as_default():
    opts = tf.profiler.ProfileOptionBuilder.float_operation()
    stats = tf.profiler.profile(
        frozen_graph, run_meta=tf.RunMetadata(), cmd='scope', options=opts)
    print('FLOP', stats.total_float_ops)
输出为:
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input (InputLayer) (1, 1, 1, 1) 0
_________________________________________________________________
conv (Conv2D) (1, 1, 1, 1) 1
_________________________________________________________________
output (Flatten) (1, 1) 0
=================================================================
Total params: 1
Trainable params: 1
Non-trainable params: 0
_________________________________________________________________
Converted 1 variables to const ops.
Parsing Inputs...
=========================Options=============================
-max_depth 10000
-min_bytes 0
-min_peak_bytes 0
-min_residual_bytes 0
-min_output_bytes 0
-min_micros 0
-min_accelerator_micros 0
-min_cpu_micros 0
-min_params 0
-min_float_ops 1
-min_occurrence 0
-step -1
-order_by float_ops
-account_type_regexes .*
-start_name_regexes .*
-trim_name_regexes
-show_name_regexes .*
-hide_name_regexes
-account_displayed_op_only true
-select float_ops
-output stdout:
==================Model Analysis Report======================
Doc:
scope: The nodes in the model graph are organized by their names, which is hierarchical like filesystem.
flops: Number of float operations. Note: Please read the implementation for the math behind it.
Profile:
node name | # float_ops
_TFProfRoot (--/2 flops)
conv/Conv2D (2/2 flops)
======================End of Report==========================
FLOP 2
考虑与您几乎相同的设置,但卷积有 n 个通道。 然后你会有n次乘法,然后你会累加所有乘法的结果。 现在可以说,你可以用第一次乘法的结果来初始化和,然后对剩下的 (n-1) 次乘法求和。 但这将是对第一次乘法的特殊处理,相反,将和初始化为 0,然后将其与所有 n 次乘法累加相加更有意义。 特别是当 n=1 时,你会遇到一个荒谬的情况
sum = 0
mult = w1 * a1
sum = sum + mult
这将导致 2 个 FLOP,或 1 个 MAC(乘法-累加)运算。