想使用libtorch建一个推理服务器,前端web准备使用rust+axum,后端推理准备尝试c++来做:
前言:
我所有的操作都在 vmware 中的虚拟机 ubuntu 20.04 上完成。有如下几个方便的点:
- 性能没啥问题
- 可以建立快照保存:如果怀疑下面的操作有问题,可以提前建立快照,出了问题可以及时还原回来。
- 主力机是 amd win10 笔记本,直接装 ubuntu 双系统会有显卡驱动不正常、亮度不能调节等毛病。
正文:
一、libtorch&opencv4&grpc编译
1.1 grpc编译
官方编译指南
注意:强烈建议不要往系统里面安装,最好是指定一个安装目录
- 先指定环境变量 (必须):
# Install prefix for the locally built gRPC; pick any directory you own.
export MY_INSTALL_DIR=$HOME/infer/installed_lib/grpc_lib # user-chosen location
mkdir -p $MY_INSTALL_DIR
# Put the prefix's bin directory first on PATH for this shell session.
export PATH="$MY_INSTALL_DIR/bin:$PATH"
- 确认cmake是否安装好:
# Install CMake from the distribution's package repositories.
sudo apt-get install cmake
- 确认常用的包是否安装好:
# Install the compiler toolchain plus autoconf, libtool and pkg-config without prompting (-y).
sudo apt install -y build-essential autoconf libtool pkg-config
- 下载GRPC:
# Clone gRPC at tag v1.41.0 together with all of its submodules.
git clone --recurse-submodules -b v1.41.0 https://github.com/grpc/grpc
# Once every repo is downloaded, note which protobuf this gRPC checkout bundles: if another dependency ships a different protobuf and linking fails, copy this one over and rebuild.
- 正式编译:
$ cd grpc
$ mkdir -p cmake/build
$ pushd cmake/build # enter the build directory and push it onto the directory stack
# Configure an install-enabled build without tests, targeting the private prefix.
$ cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
-DCMAKE_INSTALL_PREFIX="$MY_INSTALL_DIR" \
../..
# Cap parallelism at the CPU count: a bare `make -j` starts an unlimited number of jobs and can exhaust memory.
$ make -j"$(nproc)"
$ make install
$ popd
- 测试example
$ cd examples/cpp/helloworld
$ mkdir -p cmake/build
$ pushd cmake/build
# Resolve gRPC/protobuf from the private install prefix.
$ cmake -DCMAKE_PREFIX_PATH="$MY_INSTALL_DIR" ../..
# Bound the job count; a bare `make -j` is unlimited.
$ make -j"$(nproc)"
# Run the two programs in separate terminals.
# Terminal 1:
./greeter_server
# Terminal 2:
./greeter_client
1.2 libtorch编译
- 下载源码
# Fetch PyTorch together with its submodules.
git clone --recursive https://github.com/pytorch/pytorch
cd pytorch
# Re-sync submodule URLs, then make sure every nested submodule is initialised and checked out.
git submodule sync
git submodule update --init --recursive --jobs 0
- protobuf的一致性(重复依赖之前导致了问题,这里硬性重新整)
cd third_party/protobuf
# Pin protobuf to the version gRPC depends on; the commit below was read from `git branch --v` run in grpc/protobuf.
git reset --hard d1eca4e4b
- 创建conda环境
# Only the CPU build is covered here, so no extra CUDA packages are installed.
# Anaconda can be taken from the Tsinghua mirror, or the channel can be switched: https://blog.csdn.net/jasonzhoujx/article/details/81130109
# NOTE(review): no `conda create` / `conda activate` command is shown; the (build_torch) shell prompt later in this article suggests an environment of that name was created first — confirm.
conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools cffi typing_extensions future six requests dataclasses
- 开始编译
# Configure a CPU-only release build; change CMAKE_INSTALL_PREFIX to your own install directory.
mkdir release
cd release
# Every cache entry needs its own -D: a bare `USE_NNPACK=OFF` is not parsed as a cache definition, so the option is never set.
cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=~/infer/installed_lib/torch_lib -D BUILD_CAFFE2_MOBILE=OFF -D BUILD_PYTHON=OFF -D BUILD_CAFFE2_OPS=OFF -D BUILD_TEST=OFF -D USE_CUDA=OFF -D USE_CUDNN=OFF -D USE_OPENCV=ON -D USE_TBB=OFF -D USE_NNPACK=OFF ..
# `${nproc}` is an unset shell variable (it degrades to an unlimited `make -j`); `$(nproc)` runs the nproc command.
make -j"$(nproc)"
make install
- 测试:repo: https://github.com/pytorch/examples
$ git clone https://github.com/pytorch/examples.git
# The clone above creates a directory named `examples` (plural), not `example`.
$ cd examples/cpp/
$ cd mnist
$ mkdir build
$ cd build
# Point CMake at the locally installed libtorch.
$ cmake -DCMAKE_PREFIX_PATH=~/infer/installed_lib/torch_lib ..
$ make
1.3 opencv4 + contrib编译
- 先下载源码
# contrib modules: clone the tag that matches the core OpenCV release fetched below
git clone -b 4.5.4 https://github.com/opencv/opencv_contrib.git
# opencv core sources
wget https://github.com/opencv/opencv/archive/4.5.4.zip
- 安装依赖(可以先不装,如果运行的时候发现缺库再安装也可以)
# Compiler toolchain.
sudo apt install build-essential
# Build tools plus GTK and FFmpeg development headers.
sudo apt install cmake git libgtk2.0-dev pkg-config libavcodec-dev libavformat-dev libswscale-dev
# Ubuntu 20.04 no longer ships the Python 2 `python-numpy` package; use the Python 3 packages instead.
sudo apt install python3-dev python3-numpy libtbb2 libtbb-dev libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev
如果找不到 libjasper-dev:
# Add the Ubuntu xenial-security archive as an extra package source, then refresh the package index.
sudo add-apt-repository "deb http://security.ubuntu.com/ubuntu xenial-security main"
sudo apt update
# NOTE(review): `apt upgrade` upgrades every installed package and does not look required for the install below — confirm.
sudo apt upgrade
# Install the JasPer runtime and development packages.
sudo apt install libjasper1 libjasper-dev
- 编译
mkdir build
cd build
# Release build with the contrib modules, installed into the private opencv_lib prefix.
cmake -DBUILD_TIFF=OFF -DENABLE_PRECOMPILED_HEADERS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_PREFIX=~/infer/installed_lib/opencv_lib -DOPENCV_EXTRA_MODULES_PATH=~/infer/opencv/opencv_contrib/modules -DBUILD_opencv_legacy=OFF ..
# `${nproc}` is an unset variable, so the original expanded to a bare, unlimited `make -j`; `$(nproc)` limits jobs to the CPU count.
make -j"$(nproc)"
make install
这里我编译的时候发现需要的内存还是蛮多的,就关掉了一些程序后重新编译。
- 测试
cd build/bin
# The directory is full of test programs; inspect one of them with:
ldd opencv_version # shows that its shared-library dependencies already resolve
# Rebuild the bundled CMake example against the installed prefix.
# NOTE(review): samples/cpp/example_cmake is presumably relative to the OpenCV source root, not build/bin — confirm.
cd samples/cpp/example_cmake
mkdir build
cd build
cmake -DCMAKE_PREFIX_PATH=~/infer/installed_lib/opencv_lib ..
make
./opencv_example
- 补充:根据运行时候出现的错误,再决定安装哪个库
1.4 总结:
上述的编译都没有将库直接安装到系统中,因为这样子如果要换库,将来会非常的麻烦。
(build_torch) jet@jet-vm:~/infer/installed_lib$ tree -L 2
.
├── grpc_lib
│ ├── bin
│ ├── include
│ ├── lib
│ └── share
├── opencv_lib
│ ├── bin
│ ├── include
│ ├── lib
│ └── share
└── torch_lib
├── bin
├── include
├── lib
└── share
15 directories, 0 files
二、创建demo程序
给定的目录如下(目录可能不是那么合理,等我学好了再仔细修改):
jet@jet-vm:~/infer$ tree -L 3
.
├── CMakeLists.txt
├── installed_lib
│ ├── grpc_lib
│ │ ├── bin
│ │ ├── include
│ │ ├── lib
│ │ └── share
│ ├── opencv_lib
│ │ ├── bin
│ │ ├── include
│ │ ├── lib
│ │ └── share
│ └── torch_lib
│ ├── bin
│ ├── include
│ ├── lib
│ └── share
├── protos
│ └── infer.proto
└── src
└── main.cpp
18 directories, 3 files
通过 infer.proto 生成 grpc 相关文件(在上面 infer.proto 所在的 protos 目录中执行):
注意:这里我没有使用通过 apt-get 安装的 protoc,主要是怕版本不一致带来的各种问题!
$ cd protos
# Use the protoc and grpc_cpp_plugin from the private gRPC prefix; $HOME replaces the hard-coded /home/jet so the command works for any user.
$ ~/infer/installed_lib/grpc_lib/bin/protoc --grpc_out=. --cpp_out=. --plugin=protoc-gen-grpc="$HOME/infer/installed_lib/grpc_lib/bin/grpc_cpp_plugin" infer.proto
正常执行后在 protos 目录中多了几个文件,变成下面这个结构:
protos
│ ├── infer.grpc.pb.cc
│ ├── infer.grpc.pb.h
│ ├── infer.pb.cc
│ ├── infer.pb.h
│ └── infer.proto
需要联调包含三个库的程序,CMakeLists.txt 内容如下:
# Build script for the inference-server demo: compiles the protoc-generated
# gRPC/protobuf sources into a library and links the `main` executable against
# gRPC, protobuf, libtorch and OpenCV from the private install prefixes.
cmake_minimum_required(VERSION 3.5.1)
project(InferServer C CXX)

# Request C++14 explicitly instead of relying on the compiler default
# (PyTorch's 1.x C++ CMake example sets CXX_STANDARD 14).
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Search the three privately installed dependency prefixes.
# Appending (rather than overwriting) keeps any -DCMAKE_PREFIX_PATH=... that
# was given on the command line.
list(APPEND CMAKE_PREFIX_PATH
  "${PROJECT_SOURCE_DIR}/installed_lib/grpc_lib"
  "${PROJECT_SOURCE_DIR}/installed_lib/torch_lib"
  "${PROJECT_SOURCE_DIR}/installed_lib/opencv_lib")

# Sources/headers generated from protos/infer.proto by protoc + grpc_cpp_plugin.
set(hw_proto_srcs "${PROJECT_SOURCE_DIR}/protos/infer.pb.cc")
set(hw_proto_hdrs "${PROJECT_SOURCE_DIR}/protos/infer.pb.h")
set(hw_grpc_srcs "${PROJECT_SOURCE_DIR}/protos/infer.grpc.pb.cc")
set(hw_grpc_hdrs "${PROJECT_SOURCE_DIR}/protos/infer.grpc.pb.h")

# Protobuf and gRPC are located through their installed package config files.
find_package(Protobuf CONFIG REQUIRED)
find_package(gRPC CONFIG REQUIRED)
find_package(Torch REQUIRED)
find_package(OpenCV REQUIRED)

set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf)
set(_REFLECTION gRPC::grpc++_reflection)
set(_GRPC_GRPCPP gRPC::grpc++)

# Library holding the generated message and service code.
add_library(hw_grpc_proto
  ${hw_grpc_srcs}
  ${hw_grpc_hdrs}
  ${hw_proto_srcs}
  ${hw_proto_hdrs})
# PUBLIC: the generated headers expose gRPC/protobuf types, so consumers need
# the same usage requirements.
target_link_libraries(hw_grpc_proto
  PUBLIC
    ${_REFLECTION}
    ${_GRPC_GRPCPP}
    ${_PROTOBUF_LIBPROTOBUF})

# One executable per listed name, built from src/<name>.cpp.
foreach(_target
  main)
  add_executable(${_target} "src/${_target}.cpp")
  # libtorch and OpenCV are linked by the executable that includes their
  # headers, not by the generated-code library.
  target_link_libraries(${_target}
    PRIVATE
      hw_grpc_proto
      ${_REFLECTION}
      ${_GRPC_GRPCPP}
      ${_PROTOBUF_LIBPROTOBUF}
      ${TORCH_LIBRARIES}
      ${OpenCV_LIBS})
endforeach()
infer.proto 文件:
// gRPC contract for the inference server (proto3 syntax).
syntax = "proto3";

package infer;

// Single unary RPC: takes an InferRequest and returns an InferReply.
service Infer {
  rpc Infer(InferRequest) returns (InferReply);
}

// Request payload: the image as a byte string.
message InferRequest {
  bytes image = 1;
}

// One prediction entry: a name and its probability.
message Pred {
  string name = 1;
  float probability = 2;
}

// Reply payload: zero or more predictions.
message InferReply {
  repeated Pred preds = 1;
}
引入头文件,再尝试调用一点库中的代码(这里我就没写了,担心的可以自己写几句),尝试编译,能通过的话环境就基本配置好了,其他就是写逻辑了。
main.cpp:
// c++
#include <iostream>
// torch
#include <torch/torch.h>
#include <torch/script.h>
// opencv
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
// grpc
#include <grpcpp/grpcpp.h>
#include "../protos/infer.grpc.pb.h"
#include "../protos/infer.pb.h"
using namespace std;
// Placeholder entry point: performs no work and exits with status 0, so a
// successful build only shows that the included headers resolve and the
// program links.
int main()
{
    return 0;
}
编译&运行:
# Out-of-source build: configure in build/, compile, then run the resulting binary.
$ mkdir build
$ cd build
$ cmake ..
$ make
$ ./main
三、demo程序从开发机挪到运行机
确保程序在开发机运行正常过后,现在想要挪到另外一个机器中运行。最推荐的做法还是在运行机中重新编译一下;但我偷懒不想再去编译库了,所以下面直接拷贝可执行文件和依赖库:
3.1 迁移到ubuntu18.04
- 用 ldd main 命令查看依赖库
(base) jet@jet-vm:~/infer/build$ ldd main
linux-vdso.so.1 (0x00007fff59cdc000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fcaa7c6a000)
libtorch_cpu.so => /home/jet/infer/installed_lib/torch_lib/lib/libtorch_cpu.so (0x00007fca9eb7c000)
libc10.so => /home/jet/infer/installed_lib/torch_lib/lib/libc10.so (0x00007fca9eb0d000)
libtorch.so => /home/jet/infer/installed_lib/torch_lib/lib/libtorch.so (0x00007fca9eb08000)
libopencv_imgcodecs.so.4.5 => /home/jet/infer/installed_lib/opencv_lib/lib/libopencv_imgcodecs.so.4.5 (0x00007fca9e750000)
libopencv_imgproc.so.4.5 => /home/jet/infer/installed_lib/opencv_lib/lib/libopencv_imgproc.so.4.5 (0x00007fca9cbe5000)
libopencv_core.so.4.5 => /home/jet/infer/installed_lib/opencv_lib/lib/libopencv_core.so.4.5 (0x00007fca9bbe5000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fca9ba03000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fca9b8b4000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fca9b899000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fca9b6a7000)
/lib64/ld-linux-x86-64.so.2 (0x00007fcaa89a1000)
libgomp.so.1 => /lib/x86_64-linux-gnu/libgomp.so.1 (0x00007fca9b665000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fca9b658000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fca9b652000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fca9b636000)
可以发现很多是依赖系统的库,这部分库可以不考虑,直接看我们依赖的自己编译的库:
libtorch_cpu.so => /home/jet/infer/installed_lib/torch_lib/lib/libtorch_cpu.so (0x00007fca9eb7c000)
libc10.so => /home/jet/infer/installed_lib/torch_lib/lib/libc10.so (0x00007fca9eb0d000)
libtorch.so => /home/jet/infer/installed_lib/torch_lib/lib/libtorch.so (0x00007fca9eb08000)
libopencv_imgcodecs.so.4.5 => /home/jet/infer/installed_lib/opencv_lib/lib/libopencv_imgcodecs.so.4.5 (0x00007fca9e750000)
libopencv_imgproc.so.4.5 => /home/jet/infer/installed_lib/opencv_lib/lib/libopencv_imgproc.so.4.5 (0x00007fca9cbe5000)
libopencv_core.so.4.5 => /home/jet/infer/installed_lib/opencv_lib/lib/libopencv_core.so.4.5 (0x00007fca9bbe5000)
grpc 因为我们是静态链接的,所以没有依赖它的动态库。重新组织了下文件目录:
(base) jet@jet-vm:~/release$ tree
.
├── lib
│ ├── libc10.so
│ ├── libopencv_core.so -> libopencv_core.so.4.5
│ ├── libopencv_core.so.4.5 -> libopencv_core.so.4.5.4
│ ├── libopencv_core.so.4.5.4
│ ├── libopencv_imgcodecs.so -> libopencv_imgcodecs.so.4.5
│ ├── libopencv_imgcodecs.so.4.5 -> libopencv_imgcodecs.so.4.5.4
│ ├── libopencv_imgcodecs.so.4.5.4
│ ├── libopencv_imgproc.so -> libopencv_imgproc.so.4.5
│ ├── libopencv_imgproc.so.4.5 -> libopencv_imgproc.so.4.5.4
│ ├── libopencv_imgproc.so.4.5.4
│ ├── libtorch_cpu.so
│ └── libtorch.so
├── main
├── pys
│ └── model.pt
└── resource
└── labels.txt
3 directories, 15 files
现在把这些文件和目录拷贝到新的机器上(记得先tar压缩再拷贝,符号链接直接拷贝可能不行):
(base) jet@miihpc:~/self_project/infer_server/release$ ldd main
linux-vdso.so.1 (0x00007ffc139ff000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fcb53366000)
libtorch_cpu.so => not found
libc10.so => not found
libtorch.so => not found
libopencv_imgcodecs.so.4.5 => not found
libopencv_imgproc.so.4.5 => not found
libopencv_core.so.4.5 => not found
libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fcb53182000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fcb53033000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fcb53018000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fcb52e26000)
/lib64/ld-linux-x86-64.so.2 (0x00007fcb540d1000)
(base) jet@miihpc:~/self_project/infer_server/release$
可以发现新机器上这几个库缺了。
通过 export LD_LIBRARY_PATH 来引入 lib 目录中的库,这样子就不用动系统的环境了。
(base) jet@miihpc:~/self_project/infer_server/release/lib$ export LD_LIBRARY_PATH=/home/jet/self_project/infer_server/release/lib
(base) jet@miihpc:~/self_project/infer_server/release/lib$ cd ..
(base) jet@miihpc:~/self_project/infer_server/release$ ldd main
linux-vdso.so.1 (0x00007fff2b3c4000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007fdbc6147000)
libtorch_cpu.so => /home/jet/self_project/infer_server/release/lib/libtorch_cpu.so (0x00007fdbbd059000)
libc10.so => /home/jet/self_project/infer_server/release/lib/libc10.so (0x00007fdbbcfea000)
libtorch.so => /home/jet/self_project/infer_server/release/lib/libtorch.so (0x00007fdbbcfe5000)
libopencv_imgcodecs.so.4.5 => /home/jet/self_project/infer_server/release/lib/libopencv_imgcodecs.so.4.5 (0x00007fdbbcc2d000)
libopencv_imgproc.so.4.5 => /home/jet/self_project/infer_server/release/lib/libopencv_imgproc.so.4.5 (0x00007fdbbb0c2000)
libopencv_core.so.4.5 => /home/jet/self_project/infer_server/release/lib/libopencv_core.so.4.5 (0x00007fdbba0c2000)
libstdc++.so.6 => /usr/lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007fdbb9ee0000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007fdbb9d91000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007fdbb9d76000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007fdbb9b84000)
/lib64/ld-linux-x86-64.so.2 (0x00007fdbc6eb2000)
libgomp.so.1 => /usr/lib/x86_64-linux-gnu/libgomp.so.1 (0x00007fdbb9b40000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007fdbb9b35000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007fdbb9b2f000)
libz.so.1 => /lib/x86_64-linux-gnu/libz.so.1 (0x00007fdbb9b13000)
(base) jet@miihpc:~/self_project/infer_server/release$
这时迁移到 ubuntu18.04 就完成了。
3.2 迁移到centos
按照上面的思路迁移到这边的时候发现系统库并不一样:
./main: /lib64/libm.so.6: version `GLIBC_2.29' not found (required by ./main)
./main: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.26' not found (required by ./main)
./main: /lib64/libm.so.6: version `GLIBC_2.29' not found (required by /home/lighthouse/infer/release/lib/libtorch_cpu.so)
./main: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.26' not found (required by /home/lighthouse/infer/release/lib/libtorch_cpu.so)
./main: /lib64/libm.so.6: version `GLIBC_2.29' not found (required by /home/lighthouse/infer/release/lib/libc10.so)
./main: /lib64/libm.so.6: version `GLIBC_2.29' not found (required by /home/lighthouse/infer/release/lib/libopencv_imgcodecs.so.4.5)
./main: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.26' not found (required by /home/lighthouse/infer/release/lib/libopencv_imgcodecs.so.4.5)
./main: /lib64/libm.so.6: version `GLIBC_2.29' not found (required by /home/lighthouse/infer/release/lib/libopencv_imgproc.so.4.5)
./main: /lib64/libm.so.6: version `GLIBC_2.29' not found (required by /home/lighthouse/infer/release/lib/libopencv_core.so.4.5)
./main: /lib64/libstdc++.so.6: version `GLIBCXX_3.4.26' not found (required by /home/lighthouse/infer/release/lib/libopencv_core.so.4.5)
出现这个问题,说明系统的 glibc(以及 libstdc++)版本不匹配。这时候一般不要选择升级 glibc,因为一旦升级失败,系统就会直接启动不起来。
如果你像我一样用的是腾讯云主机,配置太低没办法在上面编译,就只能在虚拟机里装一个一样的系统重新编一下了:
[***@VM-0-4-centos run]$ cat /proc/version
Linux version 4.18.0-*.*.*.el8_2.x86_64 (*@*.*.centos.org) (gcc version 8.3.1 20191121 (Red Hat 8.3.1-5) (GCC)) #1 SMP Thu Oct 22 *:*:* UTC 2020
遇到的问题总结:
问题:通过 gen_model.py 生成模型的时候遇到:trace.py:966: TracerWarning: Output nr 1. of the traced function does not match
解答:保存模型前先加上: model.eval()
问题:vscode的错误波浪线不小心给关了怎么办?
解决:把 ./.vscode/settings.json 中的 C_Cpp.errorSquiggles 改成 Enabled 即可。
问题:make install 安装到系统目录中去了,想更新版本怎么办?
解决:是我就点击快照,还原快照了;如果是实体机,保险起见,就根据安装信息一个个删了,重新编译再安装吧。
问题:ubuntu编译的迁移到centos,遇到glibc版本不一致的问题
解决:没有办法,换系统重新编
参考文献:
1.CMake编写规则
2.CMake: (四) CMake语法规则
3.cmake设置生成文件的位置
4.OPENCV安装文献
5.libtorch 哪些函数比较常用?
6.OPENCV官方API文档
7.bytes[] 转 Mat的尝试