In "First experience converting PyTorch to TensorRT via ONNX" (parts 1 and 2) we learned how to run an ONNX model with TensorRT. The problem we ran into is that TensorRT 7 cannot directly accept input with a dynamic batch size: when batchsize > 1, only the first sample's result is correct and the outputs of all following samples are 0. This post explores how to do batched processing.
1. Add an auxiliary engine.
This is the solution given in TensorRT/samples/sampleDynamicReshape/sampleDynamicReshape.cpp. The main idea is to create, in front of the original INetwork, an additional network used purely for input resizing: it resizes the variable-sized input to the fixed shape the prediction network expects, and it is where the build configuration and the optimization profile are set up.
The most important part is the following:
// Finally, configure and build the preprocessor engine.
auto preprocessorConfig = makeUnique(builder->createBuilderConfig());
// Create an optimization profile so that we can specify a range of input dimensions.
auto profile = builder->createOptimizationProfile();
// This profile will be valid for all images whose size falls in the range of [(1, 1, 1, 1), (1, 1, 56, 56)]
// but TensorRT will optimize for (1, 1, 28, 28)
profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{1, 1, 1, 1});
profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{1, 1, 28, 28});
profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{1, 1, 56, 56});
preprocessorConfig->addOptimizationProfile(profile);
mPreprocessorEngine = makeUnique(builder->buildEngineWithConfig(*preprocessorNetwork, *preprocessorConfig));
The optimization profile specifies the minimum, optimum, and maximum input dimensions; at inference time, any input whose size falls between the minimum and the maximum is accepted.
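The same profile configuration can also be expressed with the TensorRT Python API. The snippet below is only a minimal sketch, assuming TensorRT 7's Python bindings; the toy identity network and the tensor name "input" are stand-ins for whatever network you actually build or parse:
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(TRT_LOGGER)
# Explicit batch is required for networks with dynamic shapes
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))

# A toy single-layer network, just so there is something to build;
# in practice the network would come from an ONNX parser or be built layer by layer
inp = network.add_input("input", trt.float32, (1, 1, -1, -1))  # -1 marks dynamic dimensions
identity = network.add_identity(inp)
network.mark_output(identity.get_output(0))

config = builder.create_builder_config()
profile = builder.create_optimization_profile()
# min / opt / max shapes for "input", mirroring the C++ sample above
profile.set_shape("input", (1, 1, 1, 1), (1, 1, 28, 28), (1, 1, 56, 56))
config.add_optimization_profile(profile)
engine = builder.build_engine(network, config)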
2. Configure the profile directly on the context.
Reference: TensorRT 7 ONNX models with variable batch size.
A more concise example using the Python API is also available; it is the basis of the code below.
- Following the documentation, we first need to generate ONNX models. Here we export two: one with a fixed batch size, resnet18.onnx, and one with a variable batch size, resnet18_dynamic.onnx.
# -*- coding:utf-8 -*-
import onnx
import torch
import torchvision
import netron

net = torchvision.models.resnet18(pretrained=True).cuda()
net.eval()

export_onnx_file = "./resnet18.onnx"
torch.onnx.export(net,                               # the network (with parameters) to convert
    torch.randn(1, 3, 224, 224, device='cuda'),      # dummy input, used to fix the input size and infer the shape of every node in the graph
    export_onnx_file,                                # name of the output file
    verbose=False,                                   # whether to print the graph as a string
    input_names=["input"],                           # input node names; a list of per-layer parameter names (e.g. ["params_%d" % i ...]) could be appended for later lookup
    output_names=["output"],                         # output node names
    opset_version=10,                                # ONNX operator set; depends on the PyTorch version, 10 is the highest supported here
    do_constant_folding=True,                        # whether to fold constants
    )

export_onnx_file = "./resnet18_dynamic.onnx"
torch.onnx.export(net,
    torch.randn(1, 3, 224, 224, device='cuda'),
    export_onnx_file,
    verbose=False,
    input_names=["input"],
    output_names=["output"],
    opset_version=10,
    do_constant_folding=True,
    dynamic_axes={"input": {0: "batch_size"}, "output": {0: "batch_size"}},  # dynamic dimensions: dim 0 of "input" and "output" is variable and named "batch_size"
    )
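Before converting, it can be worth confirming that the dynamic axis actually made it into the exported file. A minimal check, assuming the two files above were written successfully, is:
import onnx

model = onnx.load("./resnet18_dynamic.onnx")
onnx.checker.check_model(model)  # raises if the model is malformed
# Dimension 0 of "input" should now carry the symbolic name "batch_size"
print(model.graph.input[0].type.tensor_type.shape.dim)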
- Then we use the trtexec tool shipped with TensorRT to build and save a CUDA engine directly from the ONNX model. trtexec lives in <path-to-TensorRT>/bin, so add that directory to the PATH environment variable:
export PATH=/home/zwzhou/packages/TensorRT-7.0.0.11/bin:$PATH
Testing trtexec -h at first did not run correctly. Checking the CUDA version with nvcc -V and the PATH variable with echo $PATH turned up nothing unusual. Printing echo $LD_LIBRARY_PATH, however, showed that the first CUDA path in LD_LIBRARY_PATH belonged to CUDA 9.2, since that is the CUDA installed in the root environment. The locally installed CUDA therefore needs to come first on the search path:
export LD_LIBRARY_PATH=/home/zwzhou/cuda-9.0/bin:${LD_LIBRARY_PATH}
After that, trtexec -h displays its help correctly, covering model options, build options, inference options, system options, and so on.
a. Convert the fixed-shape ONNX model to a CUDA engine:
trtexec --explicitBatch --onnx=./resnet18.onnx --saveEngine=resnet18.engine
b. Convert the dynamic-shape ONNX model to a CUDA engine; here an optimization profile must be specified:
trtexec --onnx=./resnet18_dynamic.onnx --explicitBatch \
        --minShapes="input":1x3x224x224 \
        --optShapes="input":16x3x224x224 \
        --maxShapes="input":32x3x224x224 \
        --shapes="input":1x3x224x224 \
        --saveEngine=resnet18_dynamic.engine
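As an optional sanity check, the saved engine can be loaded back into trtexec and benchmarked at any shape inside the profile range; this assumes the --loadEngine and --shapes options are available in your trtexec build:
trtexec --loadEngine=resnet18_dynamic.engine --shapes="input":8x3x224x224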
c. Next, let's look at how to do the same with the Python API.
import argparse
import time
from typing import Tuple, List
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
BatchSize = 32

# Check whether a shape is fixed or dynamic
def is_fixed(shape: Tuple[int]):
    return not is_dynamic(shape)

def is_dynamic(shape: Tuple[int]):
    return any(dim is None or dim < 0 for dim in shape)

def load_engine(filename: str):
    # Load the serialized engine file into memory and deserialize it
    with open(filename, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
def get_binding_idxs(engine: trt.ICudaEngine, profile_index: int):
    # Calculate start/end binding indices for the current context's profile
    num_bindings_per_profile = engine.num_bindings // engine.num_optimization_profiles
    start_binding = profile_index * num_bindings_per_profile
    end_binding = start_binding + num_bindings_per_profile
    print("Engine/Binding Metadata")
    print("\tNumber of optimization profiles: {}".format(engine.num_optimization_profiles))
    print("\tNumber of bindings per profile: {}".format(num_bindings_per_profile))
    print("\tFirst binding for profile {}: {}".format(profile_index, start_binding))
    print("\tLast binding for profile {}: {}".format(profile_index, end_binding - 1))
    # Separate input and output binding indices for convenience
    input_binding_idxs = []
    output_binding_idxs = []
    for binding_index in range(start_binding, end_binding):
        if engine.binding_is_input(binding_index):
            input_binding_idxs.append(binding_index)
        else:
            output_binding_idxs.append(binding_index)
    return input_binding_idxs, output_binding_idxs
# Set the input shapes explicitly; the output shapes are then derived from them,
# and device memory is allocated for the outputs
def setup_binding_shapes(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    host_inputs: List[np.ndarray],
    input_binding_idxs: List[int],
    output_binding_idxs: List[int],
):
    # Explicitly set the dynamic input shapes, so the dynamic output
    # shapes can be computed internally
    for host_input, binding_index in zip(host_inputs, input_binding_idxs):
        context.set_binding_shape(binding_index, host_input.shape)
    assert context.all_binding_shapes_specified

    host_outputs = []
    device_outputs = []
    for binding_index in output_binding_idxs:
        output_shape = context.get_binding_shape(binding_index)
        # Allocate buffers to hold output results after copying back to host
        buffer = np.empty(output_shape, dtype=np.float32)
        host_outputs.append(buffer)
        # Allocate output buffers on device
        device_outputs.append(cuda.mem_alloc(buffer.nbytes))
    return host_outputs, device_outputs
def get_random_inputs(
    engine: trt.ICudaEngine,
    context: trt.IExecutionContext,
    input_binding_idxs: List[int],
    seed: int = 42,
):
    # Input data for inference
    host_inputs = []
    print("Generating Random Inputs")
    print("\tUsing random seed: {}".format(seed))
    np.random.seed(seed)
    for binding_index in input_binding_idxs:
        # If the input shape is fixed, we'll just use it
        input_shape = context.get_binding_shape(binding_index)
        input_name = engine.get_binding_name(binding_index)
        print("\tInput [{}] shape: {}".format(input_name, input_shape))
        # If the input shape is dynamic, we'll arbitrarily select one of the
        # min/opt/max shapes from our optimization profile
        if is_dynamic(input_shape):
            profile_index = context.active_optimization_profile
            profile_shapes = engine.get_profile_shape(profile_index, binding_index)
            print("\tProfile Shapes for [{}]: [kMIN {} | kOPT {} | kMAX {}]".format(input_name, *profile_shapes))
            # 0=min, 1=opt, 2=max, or choose any shape with min <= shape <= max
            input_shape = (BatchSize, 3, 224, 224)  # e.g. profile_shapes[1]
            print("\tInput [{}] shape was dynamic, setting inference shape to {}".format(input_name, input_shape))
        host_inputs.append(np.random.random(input_shape).astype(np.float32))
    return host_inputs
The main function:
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--engine", required=True, type=str,
                        help="Path to TensorRT engine file.")
    parser.add_argument("-s", "--seed", type=int, default=42,
                        help="Random seed for reproducibility.")
    args = parser.parse_args()

    # Load a serialized engine into memory and deserialize it
    engine = load_engine(args.engine)
    print("Loaded engine: {}".format(args.engine))

    # Create an execution context; it can be re-used
    context = engine.create_execution_context()
    # Profile 0 (the first profile) is used by default. A context can hold several
    # profiles; each profile defines the allowed range of input shapes.
    context.active_optimization_profile = 0
    print("Active Optimization Profile: {}".format(context.active_optimization_profile))

    # These binding_idxs can change if either the context or the
    # active_optimization_profile is changed
    input_binding_idxs, output_binding_idxs = get_binding_idxs(
        engine, context.active_optimization_profile
    )
    # Names of the input bindings
    input_names = [engine.get_binding_name(binding_idx) for binding_idx in input_binding_idxs]

    # Generate random inputs based on the profile shapes
    host_inputs = get_random_inputs(engine, context, input_binding_idxs, seed=args.seed)

    # Allocate device memory for inputs. This can easily be re-used if the
    # input shapes don't change
    device_inputs = [cuda.mem_alloc(h_input.nbytes) for h_input in host_inputs]
    # Copy host inputs to device; this needs to be done for each new input
    for h_input, d_input in zip(host_inputs, device_inputs):
        cuda.memcpy_htod(d_input, h_input)

    print("Input Metadata")
    print("\tNumber of Inputs: {}".format(len(input_binding_idxs)))
    print("\tInput Bindings for Profile {}: {}".format(context.active_optimization_profile, input_binding_idxs))
    print("\tInput names: {}".format(input_names))
    print("\tInput shapes: {}".format([inp.shape for inp in host_inputs]))

    # This needs to be called every time the input shapes change.
    # If the inputs are always the same shape (same batch size, etc.),
    # it only needs to be called once.
    host_outputs, device_outputs = setup_binding_shapes(
        engine, context, host_inputs, input_binding_idxs, output_binding_idxs,
    )  # returns host and device buffers for the outputs
    output_names = [engine.get_binding_name(binding_idx) for binding_idx in output_binding_idxs]

    print("Output Metadata")
    print("\tNumber of Outputs: {}".format(len(output_binding_idxs)))
    print("\tOutput names: {}".format(output_names))
    print("\tOutput shapes: {}".format([out.shape for out in host_outputs]))
    print("\tOutput Bindings for Profile {}: {}".format(context.active_optimization_profile, output_binding_idxs))

    # Bindings are a list of device pointers for inputs and outputs
    bindings = device_inputs + device_outputs

    # Inference: run 1000 iterations; execute_v2 is synchronous,
    # execute_async_v2 is its asynchronous counterpart
    t1 = time.time()
    for i in range(1000):
        context.execute_v2(bindings)
    t2 = time.time()
    # With 1000 iterations, the elapsed seconds equal the average per-batch time in ms
    print("Per-batch inference time (ms): {}".format(t2 - t1))
    print("Per-sample inference time (ms): {}".format((t2 - t1) / BatchSize))

    # Copy outputs back to host to view the results
    for h_output, d_output in zip(host_outputs, device_outputs):
        cuda.memcpy_dtoh(h_output, d_output)
    # View outputs
    # print("Inference Outputs:", host_outputs)

    # Cleanup (can also use context managers instead)
    del context
    del engine
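Assuming the code above is saved as, say, trt_dynamic_infer.py (the file name is mine), it can be run against the engine built earlier with:
python trt_dynamic_infer.py -e resnet18_dynamic.engine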
The table below shows the forward inference time (ms) of resnet18 on a V100:

| bs | 1 | 2 | 4 | 6 | 8 | 12 | 16 | 24 | 32 |
|---|---|---|---|---|---|---|---|---|---|
| all time (ms) | 1.57 | 1.67 | 1.78 | 2.67 | 2.81 | 3.83 | 4.80 | 6.43 | 8.65 |
| avg time (ms) | 1.57 | 0.84 | 0.45 | 0.44 | 0.35 | 0.32 | 0.30 | 0.27 | 0.27 |
To summarize, the workflow for handling a dynamic batch size is:
- Export an ONNX model with a variable batch dimension (this step is optional; the shape can also be made dynamic later in TensorRT)
- Convert the ONNX model into an engine file, e.g. with the trtexec tool
- Feed input data whose shape falls within the range allowed by the profile, and allocate host and device memory for it
- Derive the output shapes from the input shapes and allocate host and device memory for the outputs
- Run inference with execute_v2
- Copy the outputs from the device back to the CPU for further processing