Cannot create the calibration cache for the QAT model in TensorRT
I trained a quantized model (with quantization-aware training in PyTorch). I want to create a calibration cache so I can run inference in INT8 mode with TensorRT. While creating the calib cache, I get the following warnings and the cache is not created:
[03/06/2022-08:14:07] [TRT] [W] Calibrator won't be used in explicit precision mode. Use quantization aware training to generate network with Quantize/Dequantize nodes.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
[03/06/2022-08:14:11] [TRT] [W] Some weights are outside of int8_t range and will be clipped to int8_t range.
I trained the model accordingly and converted it to ONNX as follows:
import os
import sys
import argparse
import warnings
import collections
import torch
import torch.utils.data
from torch import nn
from tqdm import tqdm
import torchvision
from torchvision import transforms
from torch.hub import load_state_dict_from_url
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import calib
from pytorch_quantization.tensor_quant import QuantDescriptor
from pytorch_quantization import quant_modules
import onnxruntime
import numpy as np
import models
import kornia
from prettytable import PrettyTable
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
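# NOTE: load_data, train_one_epoch, evaluate and utils.MetricLogger are used below
# but not defined in this file; they appear to come from the torchvision
# classification reference scripts (train.py / utils.py), as in NVIDIA's
# pytorch-quantization classification_flow example.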
def get_parser():
    """
    Creates an argument parser.
    """
    parser = argparse.ArgumentParser(description='Classification quantization flow script')
    parser.add_argument('--data-dir', '-d', type=str, help='input data folder', required=True)
    parser.add_argument('--model-name', '-m', default='', help='model name, e.g. resnet34')
    parser.add_argument('--disable-pcq', '-dpcq', action="store_true", help='disable per-channel quantization for weights')
    parser.add_argument('--out-dir', '-o', default='/tmp', help='output folder: default /tmp')
    parser.add_argument('--print-freq', '-pf', type=int, default=20, help='evaluation print frequency: default 20')
    parser.add_argument('--threshold', '-t', type=float, default=-1.0, help='top1 accuracy threshold (less than 0.0 means no comparison): default -1.0')
    parser.add_argument('--batch-size-train', type=int, default=8, help='batch size for training: default 8')
    parser.add_argument('--batch-size-test', type=int, default=8, help='batch size for testing: default 8')
    parser.add_argument('--batch-size-onnx', type=int, default=20, help='batch size for onnx: default 20')
    parser.add_argument('--seed', type=int, default=12345, help='random seed: default 12345')
    checkpoint = parser.add_mutually_exclusive_group(required=True)
    checkpoint.add_argument('--ckpt-path', default='', type=str, required=False,
                            help='path to latest checkpoint (default: none)')
    checkpoint.add_argument('--ckpt-url', default='', type=str, required=False,
                            help='url to latest checkpoint (default: none)')
    checkpoint.add_argument('--pretrained', action="store_true")
    parser.add_argument('--num-calib-batch', default=8, type=int,
                        help='Number of batches for calibration. 0 will disable calibration. (default: 8)')
    parser.add_argument('--num-finetune-epochs', default=0, type=int,
                        help='Number of epochs to fine tune. 0 will disable fine tune. (default: 0)')
    parser.add_argument('--calibrator', type=str, choices=["max", "histogram"], default="max")
    parser.add_argument('--percentile', nargs='+', type=float, default=[99.9, 99.99, 99.999, 99.9999])
    parser.add_argument('--sensitivity', action="store_true", help="Build sensitivity profile")
    parser.add_argument('--evaluate-onnx', action="store_true", help="Evaluate exported ONNX")
    return parser
def prepare_model(
        model_name,
        num_class,
        data_dir,
        per_channel_quantization,
        batch_size_train,
        batch_size_test,
        batch_size_onnx,
        calibrator,
        pretrained,
        ckpt_path,
        ckpt_url=None):
    ## Initialize quantization, model and data loaders
    if per_channel_quantization:
        print('<<<<<<< Per channel quant >>>>>>>>')
        quant_desc_input = QuantDescriptor(calib_method=calibrator)
        quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
        quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)
    else:
        ## Force per-tensor quantization for onnx runtime
        print('<<<<<<< Per tensor quant >>>>>>>>')
        quant_desc_input = QuantDescriptor(calib_method=calibrator, axis=None)
        quant_nn.QuantConv2d.set_default_quant_desc_input(quant_desc_input)
        quant_nn.QuantConvTranspose2d.set_default_quant_desc_input(quant_desc_input)
        quant_nn.QuantLinear.set_default_quant_desc_input(quant_desc_input)
        quant_desc_weight = QuantDescriptor(calib_method=calibrator, axis=None)
        quant_nn.QuantConv2d.set_default_quant_desc_weight(quant_desc_weight)
        quant_nn.QuantConvTranspose2d.set_default_quant_desc_weight(quant_desc_weight)
        quant_nn.QuantLinear.set_default_quant_desc_weight(quant_desc_weight)

    if model_name in models.__dict__:
        model = models.__dict__[model_name](pretrained=pretrained, quantize=True)
        num_feats = model.fc.in_features
        model.fc = nn.Linear(num_feats, num_class)
    else:
        print('Model is not available locally, downloading...')
        quant_modules.initialize()
        model = torchvision.models.__dict__[model_name](pretrained=pretrained)
        if 'resnet' in model_name:
            num_feats = model.fc.in_features
            model.fc = nn.Linear(num_feats, num_class)
        if 'densenet' in model_name:
            num_feats = model.classifier.in_features
            model.classifier = nn.Linear(num_feats, num_class)
        quant_modules.deactivate()

    if not pretrained:
        if ckpt_path:
            checkpoint = torch.load(ckpt_path)
        else:
            checkpoint = load_state_dict_from_url(ckpt_url)
        if 'state_dict' in checkpoint:
            checkpoint = checkpoint['state_dict']
        elif 'model' in checkpoint:
            checkpoint = checkpoint['model']
        model.load_state_dict(checkpoint)
    model.eval()
    model.cuda()
    print(model)

    ## Prepare the data loaders
    traindir = os.path.join(data_dir, 'train')
    valdir = os.path.join(data_dir, 'test')
    _args = collections.namedtuple("mock_args", ["model", "distributed", "cache_dataset"])
    dataset, dataset_test, train_sampler, test_sampler = load_data(
        traindir, valdir, _args(model=model_name, distributed=False, cache_dataset=False))
    data_loader_train = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size_train,
        sampler=train_sampler, num_workers=4, pin_memory=True)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=batch_size_test,
        sampler=test_sampler, num_workers=4, pin_memory=True)
    data_loader_onnx = torch.utils.data.DataLoader(
        dataset_test, batch_size=batch_size_onnx,
        sampler=test_sampler, num_workers=4, pin_memory=True)
    return model, data_loader_train, data_loader_test, data_loader_onnx
def main(cmdline_args):
    parser = get_parser()
    args = parser.parse_args(cmdline_args)
    print(parser.description)
    print(args)

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Hard-coded overrides of the command-line arguments
    args.model_name = 'resnet34'
    args.data_dir = '/home/Dataset/'
    args.disable_pcq = True  # set True to disable per-channel quantization
    args.batch_size_train = 8
    args.batch_size_test = 8
    args.batch_size_onnx = 8
    args.calibrator = 'max'
    args.pretrained = True
    args.ckpt_path = ''
    args.ckpt_url = ''
    args.num_class = 5

    ## Prepare the pretrained model and data loaders
    model, data_loader_train, data_loader_test, data_loader_onnx = prepare_model(
        args.model_name,
        args.num_class,
        args.data_dir,
        not args.disable_pcq,
        args.batch_size_train,
        args.batch_size_test,
        args.batch_size_onnx,
        args.calibrator,
        args.pretrained,
        args.ckpt_path,
        args.ckpt_url)

    kwargs = {"alpha": 0.75, "gamma": 2.0, "reduction": 'mean'}
    criterion = kornia.losses.FocalLoss(**kwargs)

    ## Calibrate the model
    with torch.no_grad():
        calibrate_model(
            model=model,
            model_name=args.model_name,
            data_loader=data_loader_train,
            num_calib_batch=args.num_calib_batch,
            calibrator=args.calibrator,
            hist_percentile=args.percentile,
            out_dir=args.out_dir)

    ## Build sensitivity profile
    if args.sensitivity:
        build_sensitivity_profile(model, criterion, data_loader_test)

    kwargs = {"alpha": 0.75, "gamma": 3.0, "reduction": 'mean'}
    criterion = kornia.losses.FocalLoss(**kwargs)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.num_finetune_epochs)
    for epoch in range(args.num_finetune_epochs):
        # Train a single epoch
        train_one_epoch(model, criterion, optimizer, data_loader_train, "cuda", epoch, 100)
        lr_scheduler.step()

    if args.num_finetune_epochs > 0:
        ## Evaluate after finetuning
        with torch.no_grad():
            print('Finetune evaluation:')
            top1_finetuned = evaluate(model, criterion, data_loader_test, device="cuda")
    else:
        top1_finetuned = -1.0

    ## Export to ONNX
    onnx_filename = args.out_dir + '/' + args.model_name + ".onnx"
    top1_onnx = -1.0
    if export_onnx(model, onnx_filename, args.batch_size_onnx, not args.disable_pcq) and args.evaluate_onnx:
        ## Validate ONNX and evaluate
        top1_onnx = evaluate_onnx(onnx_filename, data_loader_onnx, criterion, args.print_freq)

    ## Print summary
    print("Accuracy summary:")
    table = PrettyTable(['Stage', 'Top1'])
    table.align['Stage'] = "l"
    table.add_row(['Finetuned', "{:.2f}".format(top1_finetuned)])
    table.add_row(['ONNX', "{:.2f}".format(top1_onnx)])
    print(table)

    ## Compare results
    if args.threshold >= 0.0:
        if args.evaluate_onnx and top1_onnx < 0.0:
            print("Failed to export/evaluate ONNX!")
            return 1
        if args.num_finetune_epochs > 0:
            if top1_finetuned >= (top1_onnx - args.threshold):
                print("Accuracy threshold was met!")
            else:
                print("Accuracy threshold was missed!")
                return 1
    return 0
def evaluate_onnx(onnx_filename, data_loader, criterion, print_freq):
    print("Loading ONNX file: ", onnx_filename)
    ort_session = onnxruntime.InferenceSession(onnx_filename)
    metric_logger = utils.MetricLogger(delimiter=" ")
    header = 'Test:'
    with torch.no_grad():
        for image, target in metric_logger.log_every(data_loader, print_freq, header):
            image = image.to("cpu", non_blocking=True)
            input_data = np.array(image)
            # Run the data through onnx runtime instead of the torch model
            input_name = ort_session.get_inputs()[0].name
            raw_result = ort_session.run([], {input_name: input_data})
            output = torch.tensor(raw_result[0])
            loss = criterion(output, target)
            acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
            batch_size = image.shape[0]
            metric_logger.update(loss=loss.item())
            metric_logger.meters['acc1'].update(acc1.item(), n=batch_size)
            metric_logger.meters['acc5'].update(acc5.item(), n=batch_size)
    # Gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print(' ONNXRuntime: Acc@1 {top1.global_avg:.3f} Acc@5 {top5.global_avg:.3f}'
          .format(top1=metric_logger.acc1, top5=metric_logger.acc5))
    return metric_logger.acc1.global_avg
def export_onnx(model, onnx_filename, batch_onnx, per_channel_quantization):
    model.eval()
    quant_nn.TensorQuantizer.use_fb_fake_quant = True  # We have to shift to pytorch's fake quant ops before exporting the model to ONNX
    if per_channel_quantization:
        opset_version = 13
    else:
        opset_version = 12

    # Export ONNX for multiple batch sizes
    print("Creating ONNX file: " + onnx_filename)
    dummy_input = torch.randn(batch_onnx, 3, 224, 224, device='cuda')  # TODO: switch input dims by model
    input_names = ['input']
    if 'resnet' in onnx_filename:
        print('Changing last layer of resnet...')
        output_names = ['Linear[fc]']  ### ResNet34
    if 'densenet' in onnx_filename:
        print('Changing last layer of densenet...')
        output_names = ['Linear[classifier]']  #### DenseNet
    dynamic_axes = {'input': {0: 'batch_size'}}
    try:
        torch.onnx.export(model, dummy_input, onnx_filename, input_names=input_names,
                          export_params=True, output_names=output_names, opset_version=opset_version,
                          dynamic_axes=dynamic_axes, verbose=True, enable_onnx_checker=False, do_constant_folding=True)
    except ValueError:
        warnings.warn(UserWarning("Per-channel quantization is not yet supported in Pytorch/ONNX RT (requires ONNX opset 13)"))
        print("Failed to export to ONNX")
        return False
    return True
def calibrate_model(model, model_name, data_loader, num_calib_batch, calibrator, hist_percentile, out_dir):
    if num_calib_batch > 0:
        print("Calibrating model")
        with torch.no_grad():
            collect_stats(model, data_loader, num_calib_batch)
        if calibrator != "histogram":
            compute_amax(model, method="max")
            calib_output = os.path.join(
                out_dir,
                f"{model_name}-max-{num_calib_batch*data_loader.batch_size}.pth")
            # torch.save(model.state_dict(), calib_output)  # just the weights
            torch.save(model, calib_output)  # whole model
        else:
            for percentile in hist_percentile:
                print(f"{percentile} percentile calibration")
                compute_amax(model, method="percentile", percentile=percentile)
                calib_output = os.path.join(
                    out_dir,
                    f"{model_name}-percentile-{percentile}-{num_calib_batch*data_loader.batch_size}.pth")
                torch.save(model, calib_output)  # whole model
            for method in ["mse", "entropy"]:
                print(f"{method} calibration")
                compute_amax(model, method=method)
                calib_output = os.path.join(
                    out_dir,
                    f"{model_name}-{method}-{num_calib_batch*data_loader.batch_size}.pth")
                # torch.save(model.state_dict(), calib_output)
                torch.save(model, calib_output)
def collect_stats(model, data_loader, num_batches):
    # Enable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()
    # Feed data to the network to collect statistics
    for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
        model(image.cuda())
        if i >= num_batches:
            break
    # Disable calibrators
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()
def compute_amax(model, **kwargs):
    # Load the calibration result
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    module.load_calib_amax(**kwargs)
            print(f"{name:40}: {module}")
    model.cuda()
def build_sensitivity_profile(model, criterion, data_loader_test):
    quant_layer_names = []
    for name, module in model.named_modules():
        if name.endswith("_quantizer"):
            module.disable()
            layer_name = name.replace("._input_quantizer", "").replace("._weight_quantizer", "")
            if layer_name not in quant_layer_names:
                quant_layer_names.append(layer_name)
    for i, quant_layer in enumerate(quant_layer_names):
        print("Enable", quant_layer)
        for name, module in model.named_modules():
            if name.endswith("_quantizer") and quant_layer in name:
                module.enable()
                print(f"{name:40}: {module}")
        with torch.no_grad():
            evaluate(model, criterion, data_loader_test, device="cuda")
        for name, module in model.named_modules():
            if name.endswith("_quantizer") and quant_layer in name:
                module.disable()
                print(f"{name:40}: {module}")
if __name__ == '__main__':
    res = main(sys.argv[1:])
    sys.exit(res)
Some more information about the system:
TensorRT == 8.2
Pytorch == 1.9.0+cu111
Torchvision == 0.10.0+cu111
ONNX == 1.9.0
ONNXRuntime == 1.8.1
pycuda == 2021
If there are Q/DQ nodes in the ONNX model, you may not need a calibration cache at all, because the Q/DQ nodes already carry the quantization parameters (scale and zero-point). You can run the Q/DQ ONNX model directly through the TensorRT Execution Provider in ONNXRuntime (>= v1.9.0).
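For reference, here is a minimal sketch (not the asker's code) of building such a Q/DQ model into an INT8 engine with the TensorRT 8.2 Python API. No calibrator is attached, since the scales come from the Q/DQ nodes; the file paths and the 'input' shapes follow the export script above and are otherwise assumptions:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def build_int8_engine_from_qdq_onnx(onnx_path, engine_path):
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)
    with open(onnx_path, 'rb') as f:
        if not parser.parse(f.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
            return False

    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30
    # Enable INT8 but do NOT set a calibrator: the Q/DQ nodes supply the scales.
    config.set_flag(trt.BuilderFlag.INT8)

    # The export above uses a dynamic batch axis, so an optimization profile is required.
    profile = builder.create_optimization_profile()
    profile.set_shape('input', (1, 3, 224, 224), (8, 3, 224, 224), (8, 3, 224, 224))
    config.add_optimization_profile(profile)

    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        return False
    with open(engine_path, 'wb') as f:
        f.write(serialized_engine)
    return True

build_int8_engine_from_qdq_onnx('/tmp/resnet34.onnx', '/tmp/resnet34-int8.engine')

The command-line equivalent would be something like trtexec --onnx=/tmp/resnet34.onnx --int8 --saveEngine=/tmp/resnet34-int8.engine.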
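Likewise, a minimal sketch of running the exported Q/DQ model directly on ONNXRuntime's TensorRT Execution Provider (this needs onnxruntime-gpu >= 1.9.0, i.e. newer than the 1.8.1 listed above; the model path is an assumption):

import numpy as np
import onnxruntime

# No calibration cache is passed; the Q/DQ nodes carry the quantization parameters.
session = onnxruntime.InferenceSession(
    '/tmp/resnet34.onnx',
    providers=[
        ('TensorrtExecutionProvider', {'trt_int8_enable': True}),
        'CUDAExecutionProvider',  # fallback for any nodes TensorRT does not take
    ])

dummy = np.random.randn(8, 3, 224, 224).astype(np.float32)
input_name = session.get_inputs()[0].name
outputs = session.run(None, {input_name: dummy})
print(outputs[0].shape)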