TF OD 2 API 中的训练过早结束
Premature end of training in TF OD 2 API
这些天我一直在试用 TensorFlow Object Detection API 2(TF OD 2),使用的是 git head 提交 ce3b7227。我的目标是利用 TensorFlow 2 Model Zoo 中现有的深度学习架构,为我的自定义数据集找到最合适的模型。我按照 Roboflow 的教程生成了 TFRecord,并且一直在笔记本电脑和 Google Colab 上以 GPU 模式进行训练。
我发现了 Roboflow 提供的一个很棒的 Colab Notebook,并尝试使用 models/research/object_detection/model_main_tf2.py 在我自己的数据集上重现相同的步骤。不幸的是,训练脚本总是在开始迭代之前就结束了。它没有显示任何 Python 错误,只是像往常一样输出了一些警告。完整的输出在我的 Colab Notebook 中。
我正在使用以下命令微调模型。
PIPELINE_CONFIG_PATH=models/ssd_resnet152_v1_fpn_640x640_coco17_tpu-8/pipeline.config; MODEL_DIR=training/; NUM_TRAIN_STEPS=10000; SAMPLE_1_OF_N_EVAL_EXAMPLES=1;
python models/research/object_detection/model_main_tf2.py --model_dir=$MODEL_DIR --num_train_steps=$NUM_TRAIN_STEPS --sample_1_of_n_eval_examples=$SAMPLE_1_OF_N_EVAL_EXAMPLES --pipeline_config_path=$PIPELINE_CONFIG_PATH --alsologtostderr
这是我的 pipeline.config 文件
model {
ssd {
num_classes: 90
image_resizer {
fixed_shape_resizer {
height: 640
width: 640
}
}
feature_extractor {
type: "ssd_resnet152_v1_fpn_keras"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 0.00039999998989515007
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.029999999329447746
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
override_base_feature_extractor_hyperparams: true
fpn {
min_level: 3
max_level: 7
}
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 0.00039999998989515007
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.009999999776482582
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
depth: 256
num_layers_before_predictor: 4
kernel_size: 3
class_prediction_bias_init: -4.599999904632568
}
}
anchor_generator {
multiscale_anchor_generator {
min_level: 3
max_level: 7
anchor_scale: 4.0
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
scales_per_octave: 2
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 9.99999993922529e-09
iou_threshold: 0.6000000238418579
max_detections_per_class: 100
max_total_detections: 100
use_static_shapes: false
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid_focal {
gamma: 2.0
alpha: 0.25
}
}
classification_weight: 1.0
localization_weight: 1.0
}
encode_background_as_zeros: true
normalize_loc_loss_by_codesize: true
inplace_batchnorm_update: true
freeze_batchnorm: false
}
}
train_config {
batch_size: 8
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_crop_image {
min_object_covered: 0.0
min_aspect_ratio: 0.75
max_aspect_ratio: 3.0
min_area: 0.75
max_area: 1.0
overlap_thresh: 0.0
}
}
sync_replicas: true
optimizer {
momentum_optimizer {
learning_rate {
cosine_decay_learning_rate {
learning_rate_base: 0.03999999910593033
total_steps: 25000
warmup_learning_rate: 0.013333000242710114
warmup_steps: 2000
}
}
momentum_optimizer_value: 0.8999999761581421
}
use_moving_average: false
}
fine_tune_checkpoint_version: V2
fine_tune_checkpoint: "models/ssd_resnet152_v1_fpn_640x640_coco17_tpu-8/checkpoint/ckpt-0"
num_steps: 25000
startup_delay_steps: 0.0
replicas_to_aggregate: 8
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_type: "classification"
use_bfloat16: true
}
train_input_reader {
label_map_path: "datasets/UrbanTracker/urban_tracker_label_map.pbtxt"
tf_record_input_reader {
input_path: "datasets/UrbanTracker/urban_tracker_train.record"
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
}
eval_input_reader {
label_map_path: "datasets/UrbanTracker/urban_tracker_label_map.pbtxt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "datasets/UrbanTracker/urban_tracker_test.record"
}
}
这就是我的模型目录的样子。
.
├── datasets
│ ├── raccoon
│ │ ├── raccoon_label_map.pbtxt
│ │ ├── raccoon_test.record
│ │ └── raccoon_train.record
│ ├── readme.md
│ └── UrbanTracker
│ ├── labels_urbantracker.txt
│ ├── urban_tracker_label_map.pbtxt
│ ├── urban_tracker_test.record
│ └── urban_tracker_train.record
├── __main__.py
├── models
│ ├── AUTHORS
│ ├── efficientdet_d1_coco17_tpu-32
│ │ ├── checkpoint
│ │ │ ├── checkpoint
│ │ │ ├── ckpt-0.data-00000-of-00001
│ │ │ └── ckpt-0.index
│ │ ├── pipeline.config
│ │ ├── saved_model
│ │ │ ├── assets
│ │ │ ├── saved_model.pb
│ │ │ └── variables
│ │ │ ├── variables.data-00000-of-00001
│ │ │ └── variables.index
│ ├── faster_rcnn_resnet101_v1_640x640_coco17_tpu-8
│ │ ├── checkpoint
│ │ │ ├── checkpoint
│ │ │ ├── ckpt-0.data-00000-of-00001
│ │ │ └── ckpt-0.index
│ │ ├── pipeline.config
│ │ ├── saved_model
│ │ │ ├── saved_model.pb
│ │ │ └── variables
│ │ │ ├── variables.data-00000-of-00001
│ │ │ └── variables.index
│ └── ssd_resnet152_v1_fpn_640x640_coco17_tpu-8
│ ├── checkpoint
│ │ ├── checkpoint
│ │ ├── ckpt-0.data-00000-of-00001
│ │ └── ckpt-0.index
│ ├── pipeline.config
│ ├── saved_model
│ │ ├── assets
│ │ ├── saved_model.pb
│ │ └── variables
│ │ ├── variables.data-00000-of-00001
│ │ └── variables.index
├── tools
│ ├── parse_polytrack.py
│ ├── polytrack_csv_to_tfrecord.py
│ ├── raccoon_labels_test.csv
│ ├── raccoon_labels_train.csv
│ ├── split_dataset.py
│ ├── urban_tracker_test.csv
│ └── urban_tracker_train.csv
我已经分别使用 TF v1 和 v2 API 将我的数据集转换为 TFRecord。另外,我也尝试过调整各种训练参数,但都没有成功。为了排除数据集生成有误的可能,我还尝试了另一个数据集,即基础的 Raccoon Dataset,但得到了相同的结果。
感谢您的关注。
已解决:对于 efficientdet_d1_coco17_tpu-32 等模型,只需将 pipeline.config 中的参数 fine_tune_checkpoint_type: "classification" 改为 fine_tune_checkpoint_type: "detection" 即可,详见 TensorFlow 的 GitHub。
这些天我一直在试用 TensorFlow Object Detection API 2(TF OD 2),使用的是 git head 提交 ce3b7227。我的目标是利用 TensorFlow 2 Model Zoo 中现有的深度学习架构,为我的自定义数据集找到最合适的模型。我按照 Roboflow 的教程生成了 TFRecord,并且一直在笔记本电脑和 Google Colab 上以 GPU 模式进行训练。
我发现了 Roboflow 提供的一个很棒的 Colab Notebook,并尝试使用 models/research/object_detection/model_main_tf2.py 在我自己的数据集上重现相同的步骤。不幸的是,训练脚本总是在开始迭代之前就结束了。它没有显示任何 Python 错误,只是像往常一样输出了一些警告。完整的输出在我的 Colab Notebook 中。
我正在使用以下命令微调模型。
PIPELINE_CONFIG_PATH=models/ssd_resnet152_v1_fpn_640x640_coco17_tpu-8/pipeline.config; MODEL_DIR=training/; NUM_TRAIN_STEPS=10000; SAMPLE_1_OF_N_EVAL_EXAMPLES=1;
python models/research/object_detection/model_main_tf2.py --model_dir=$MODEL_DIR --num_train_steps=$NUM_TRAIN_STEPS --sample_1_of_n_eval_examples=$SAMPLE_1_OF_N_EVAL_EXAMPLES --pipeline_config_path=$PIPELINE_CONFIG_PATH --alsologtostderr
这是我的 pipeline.config 文件
model {
ssd {
num_classes: 90
image_resizer {
fixed_shape_resizer {
height: 640
width: 640
}
}
feature_extractor {
type: "ssd_resnet152_v1_fpn_keras"
depth_multiplier: 1.0
min_depth: 16
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 0.00039999998989515007
}
}
initializer {
truncated_normal_initializer {
mean: 0.0
stddev: 0.029999999329447746
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
override_base_feature_extractor_hyperparams: true
fpn {
min_level: 3
max_level: 7
}
}
box_coder {
faster_rcnn_box_coder {
y_scale: 10.0
x_scale: 10.0
height_scale: 5.0
width_scale: 5.0
}
}
matcher {
argmax_matcher {
matched_threshold: 0.5
unmatched_threshold: 0.5
ignore_thresholds: false
negatives_lower_than_unmatched: true
force_match_for_each_row: true
use_matmul_gather: true
}
}
similarity_calculator {
iou_similarity {
}
}
box_predictor {
weight_shared_convolutional_box_predictor {
conv_hyperparams {
regularizer {
l2_regularizer {
weight: 0.00039999998989515007
}
}
initializer {
random_normal_initializer {
mean: 0.0
stddev: 0.009999999776482582
}
}
activation: RELU_6
batch_norm {
decay: 0.996999979019165
scale: true
epsilon: 0.0010000000474974513
}
}
depth: 256
num_layers_before_predictor: 4
kernel_size: 3
class_prediction_bias_init: -4.599999904632568
}
}
anchor_generator {
multiscale_anchor_generator {
min_level: 3
max_level: 7
anchor_scale: 4.0
aspect_ratios: 1.0
aspect_ratios: 2.0
aspect_ratios: 0.5
scales_per_octave: 2
}
}
post_processing {
batch_non_max_suppression {
score_threshold: 9.99999993922529e-09
iou_threshold: 0.6000000238418579
max_detections_per_class: 100
max_total_detections: 100
use_static_shapes: false
}
score_converter: SIGMOID
}
normalize_loss_by_num_matches: true
loss {
localization_loss {
weighted_smooth_l1 {
}
}
classification_loss {
weighted_sigmoid_focal {
gamma: 2.0
alpha: 0.25
}
}
classification_weight: 1.0
localization_weight: 1.0
}
encode_background_as_zeros: true
normalize_loc_loss_by_codesize: true
inplace_batchnorm_update: true
freeze_batchnorm: false
}
}
train_config {
batch_size: 8
data_augmentation_options {
random_horizontal_flip {
}
}
data_augmentation_options {
random_crop_image {
min_object_covered: 0.0
min_aspect_ratio: 0.75
max_aspect_ratio: 3.0
min_area: 0.75
max_area: 1.0
overlap_thresh: 0.0
}
}
sync_replicas: true
optimizer {
momentum_optimizer {
learning_rate {
cosine_decay_learning_rate {
learning_rate_base: 0.03999999910593033
total_steps: 25000
warmup_learning_rate: 0.013333000242710114
warmup_steps: 2000
}
}
momentum_optimizer_value: 0.8999999761581421
}
use_moving_average: false
}
fine_tune_checkpoint_version: V2
fine_tune_checkpoint: "models/ssd_resnet152_v1_fpn_640x640_coco17_tpu-8/checkpoint/ckpt-0"
num_steps: 25000
startup_delay_steps: 0.0
replicas_to_aggregate: 8
max_number_of_boxes: 100
unpad_groundtruth_tensors: false
fine_tune_checkpoint_type: "classification"
use_bfloat16: true
}
train_input_reader {
label_map_path: "datasets/UrbanTracker/urban_tracker_label_map.pbtxt"
tf_record_input_reader {
input_path: "datasets/UrbanTracker/urban_tracker_train.record"
}
}
eval_config {
metrics_set: "coco_detection_metrics"
use_moving_averages: false
}
eval_input_reader {
label_map_path: "datasets/UrbanTracker/urban_tracker_label_map.pbtxt"
shuffle: false
num_epochs: 1
tf_record_input_reader {
input_path: "datasets/UrbanTracker/urban_tracker_test.record"
}
}
这就是我的模型目录的样子。
.
├── datasets
│ ├── raccoon
│ │ ├── raccoon_label_map.pbtxt
│ │ ├── raccoon_test.record
│ │ └── raccoon_train.record
│ ├── readme.md
│ └── UrbanTracker
│ ├── labels_urbantracker.txt
│ ├── urban_tracker_label_map.pbtxt
│ ├── urban_tracker_test.record
│ └── urban_tracker_train.record
├── __main__.py
├── models
│ ├── AUTHORS
│ ├── efficientdet_d1_coco17_tpu-32
│ │ ├── checkpoint
│ │ │ ├── checkpoint
│ │ │ ├── ckpt-0.data-00000-of-00001
│ │ │ └── ckpt-0.index
│ │ ├── pipeline.config
│ │ ├── saved_model
│ │ │ ├── assets
│ │ │ ├── saved_model.pb
│ │ │ └── variables
│ │ │ ├── variables.data-00000-of-00001
│ │ │ └── variables.index
│ ├── faster_rcnn_resnet101_v1_640x640_coco17_tpu-8
│ │ ├── checkpoint
│ │ │ ├── checkpoint
│ │ │ ├── ckpt-0.data-00000-of-00001
│ │ │ └── ckpt-0.index
│ │ ├── pipeline.config
│ │ ├── saved_model
│ │ │ ├── saved_model.pb
│ │ │ └── variables
│ │ │ ├── variables.data-00000-of-00001
│ │ │ └── variables.index
│ └── ssd_resnet152_v1_fpn_640x640_coco17_tpu-8
│ ├── checkpoint
│ │ ├── checkpoint
│ │ ├── ckpt-0.data-00000-of-00001
│ │ └── ckpt-0.index
│ ├── pipeline.config
│ ├── saved_model
│ │ ├── assets
│ │ ├── saved_model.pb
│ │ └── variables
│ │ ├── variables.data-00000-of-00001
│ │ └── variables.index
├── tools
│ ├── parse_polytrack.py
│ ├── polytrack_csv_to_tfrecord.py
│ ├── raccoon_labels_test.csv
│ ├── raccoon_labels_train.csv
│ ├── split_dataset.py
│ ├── urban_tracker_test.csv
│ └── urban_tracker_train.csv
我已经分别使用 TF v1 和 v2 API 将我的数据集转换为 TFRecord。另外,我也尝试过调整各种训练参数,但都没有成功。为了排除数据集生成有误的可能,我还尝试了另一个数据集,即基础的 Raccoon Dataset,但得到了相同的结果。
感谢您的关注。
已解决:对于 efficientdet_d1_coco17_tpu-32 等模型,只需将 pipeline.config 中的参数 fine_tune_checkpoint_type: "classification" 改为 fine_tune_checkpoint_type: "detection" 即可,详见 TensorFlow 的 GitHub。