在边缘计算设备上部署目标检测模型一直是计算机视觉领域的热点需求。瑞芯微RK3588作为一款高性能AIoT芯片,其内置的NPU单元为YOLOv5等模型的实时推理提供了硬件加速支持。本文将详细介绍如何在RK3588平台上通过Python API实现YOLOv5模型的完整推理流程。
这个方案的核心价值在于:
推荐配置:
注意:虽然x86平台也可用于模型转换,但最终推理必须在ARM架构的RK3588设备上运行
RKNN Toolkit是Rockchip提供的模型转换和推理工具链,我们需要安装两个核心组件:
安装步骤:
# Pick the wheel matching your platform.
# On the RK3588 device itself (aarch64 architecture):
pip install rknn_toolkit_lite2-1.4.0-cp38-cp38-linux_aarch64.whl
pip install rknn_toolkit2-1.4.0_22dcfef4-cp38-cp38-linux_aarch64.whl
# Verify the installation
python -c "from rknn.api import RKNN; print('RKNN import success')"
pip install opencv-python numpy pycocotools
YOLOv5模型需要转换为RKNN格式才能在RK3588的NPU上运行,标准转换流程为:
PyTorch(.pt) → ONNX(.onnx) → RKNN(.rknn)
转换脚本关键代码:
def convert_to_rknn(model_path, output_path):
    """Convert a YOLOv5 ONNX model into RKNN format for the RK3588 NPU.

    Args:
        model_path: Path to the input .onnx model.
        output_path: Destination path for the exported .rknn model.

    Returns:
        0 on success, a non-zero RKNN error code otherwise.
    """
    rknn = RKNN()
    try:
        # Bake preprocessing into the model: x' = (x - mean) / std,
        # i.e. scale 0-255 pixel values down to 0-1.
        rknn.config(
            mean_values=[[0, 0, 0]],
            std_values=[[255, 255, 255]],
            target_platform='rk3588')

        # BUG FIX: the original ignored every return code, so a failed
        # step silently produced a broken model. Bail out early instead.
        ret = rknn.load_onnx(model=model_path)
        if ret != 0:
            return ret

        # INT8 quantization (optional): speeds up inference at a small
        # accuracy cost; dataset.txt lists calibration images.
        ret = rknn.build(do_quantization=True, dataset='./dataset.txt')
        if ret != 0:
            return ret

        ret = rknn.export_rknn(output_path)
        return ret
    finally:
        # Release native resources even when a step above fails.
        rknn.release()
class Yolov5RK3588:
    """YOLOv5 detector running on the RK3588 NPU via the RKNN runtime."""

    def __init__(self, model_path, device_id=None):
        self.rknn = RKNN()
        self.load_model(model_path, device_id)
        # NOTE(review): expected to set self.input_size (used by
        # preprocess) among other postprocessing parameters — confirm.
        self.init_postprocess_params()

    def load_model(self, model_path, device_id):
        """Load the .rknn model and initialize the NPU runtime.

        Raises:
            RuntimeError: if loading or runtime initialization fails.
        """
        # BUG FIX: the original discarded the return codes, so a bad
        # model path surfaced only later as a confusing inference error.
        ret = self.rknn.load_rknn(model_path)
        if ret != 0:
            raise RuntimeError(f'load_rknn failed (ret={ret})')
        ret = self.rknn.init_runtime(
            target='rk3588',
            device_id=device_id,
            perf_debug=True)
        if ret != 0:
            raise RuntimeError(f'init_runtime failed (ret={ret})')

    def preprocess(self, img):
        """Letterbox-resize to the model input size and convert BGR→RGB."""
        img = letter_box(img, self.input_size)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img

    def inference(self, img):
        """Run the full pipeline on one BGR image.

        Returns:
            Tuple (boxes, scores, classes) from postprocess().
        """
        inputs = self.preprocess(img)
        outputs = self.rknn.inference(inputs=[inputs])
        boxes, scores, classes = self.postprocess(outputs)
        return boxes, scores, classes

    def postprocess(self, outputs):
        """Decode raw NPU outputs (NMS etc.; implementation elided)."""
        ...
# Guard NPU inference with a lock when calling from multiple threads;
# create the lock once (e.g. in __init__), never per call.
self._modelLock = threading.Lock()

with self._modelLock:
    outputs = self.rknn.inference(inputs=[inputs])
def close(self):
    """Release the RKNN runtime; safe to call more than once."""
    if hasattr(self, 'rknn'):
        self.rknn.release()
        # Drop the reference so a second close() is a clean no-op
        # instead of releasing the native context twice.
        del self.rknn
# Single-image detection example.
detector = Yolov5RK3588('yolov5s.rknn')
img = cv2.imread('test.jpg')
boxes, scores, classes = detector.inference(img)
for box, score, cls in zip(boxes, scores, classes):
    if score > 0.5:  # confidence threshold
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
cv2.imwrite('result.jpg', img)
# Real-time webcam detection example.
cap = cv2.VideoCapture(0)
detector = Yolov5RK3588('yolov5s.rknn')
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        start = time.time()
        boxes, scores, classes = detector.inference(frame)
        print(f"Inference time: {(time.time()-start)*1000:.2f}ms")
        # Draw detection results...
        cv2.imshow('Detection', frame)
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # BUG FIX: the original never released the camera or the display
    # window, leaking the capture device on exit.
    cap.release()
    cv2.destroyAllWindows()
    detector.close()
RK3588提供三种NPU核心分配模式:
# Single-core mode (lowest power)
rknn.init_runtime(core_mask=RKNN.NPU_CORE_0)
# Dual-core mode (balanced)
rknn.init_runtime(core_mask=RKNN.NPU_CORE_0_1)
# Tri-core mode (maximum throughput)
rknn.init_runtime(core_mask=RKNN.NPU_CORE_0_1_2)
# Enable the toolkit's memory-optimization pass.
rknn.config(enable_mem_opt=True)
# 8-bit asymmetric quantization settings.
rknn.config(
    quantized_dtype='asymmetric_quantized-8',
    quantized_algorithm='normal')
在转换为RKNN前对YOLOv5进行剪枝:
# Prune YOLOv5 before RKNN conversion (run inside the yolov5 repo).
from models.yolo import Model

model = Model('yolov5s.yaml').load('yolov5s.pt')
model.prune()  # custom pruning logic
问题现象:
E [convert_model_to_rknn:221] Unsupported OP: SiLU
解决方案:
# Add in export.py: replace the final SiLU activation with ReLU,
# which the RKNN converter supports.
model.model[-1].act = nn.ReLU()
可能原因:
排查步骤:
优化方向:
python -m rknn.benchmark --model yolov5s.rknn
echo performance > /sys/devices/platform/fde40000.npu/devfreq/fde40000.npu/governor
class MultiModelInference:
    """Load several RKNN models and run them sequentially on one input."""

    def __init__(self, model_paths):
        """Load and initialize one RKNN runtime per path in *model_paths*."""
        self.models = [RKNN() for _ in model_paths]
        # One lock per model: RKNN runtime contexts are not thread-safe,
        # and a shared lock would serialize unrelated models.
        self._locks = [threading.Lock() for _ in self.models]
        for model, path in zip(self.models, model_paths):
            model.load_rknn(path)
            model.init_runtime()

    def infer(self, inputs):
        """Return a list with each model's inference result for *inputs*."""
        results = []
        for model, lock in zip(self.models, self._locks):
            # BUG FIX: the original did `with threading.Lock():`, which
            # acquires a brand-new lock every iteration and therefore
            # synchronizes nothing. Use the per-model lock instead.
            with lock:
                results.append(model.inference(inputs))
        return results
针对特定场景优化NMS参数:
def custom_nms(boxes, scores, iou_thres=0.6, conf_thres=0.4):
    """Scenario-tuned non-maximum suppression (implementation elided).

    Args:
        boxes: candidate boxes.
        scores: per-box confidence scores.
        iou_thres: IoU overlap above which boxes are suppressed.
        conf_thres: minimum confidence to keep a box at all.
    """
    ...
def reload_model(self, new_model_path):
    """Hot-swap the RKNN model without restarting the service.

    Holds the model lock for the whole swap so no inference can run
    against a released or half-initialized runtime.

    Raises:
        RuntimeError: if loading or runtime initialization fails.
    """
    with self._modelLock:
        self.rknn.release()
        self.rknn = RKNN()
        # BUG FIX: check return codes — a silent failure here would
        # leave every later inference call broken.
        ret = self.rknn.load_rknn(new_model_path)
        if ret != 0:
            raise RuntimeError(f'load_rknn failed (ret={ret})')
        ret = self.rknn.init_runtime()
        if ret != 0:
            raise RuntimeError(f'init_runtime failed (ret={ret})')
在实际部署中,这套方案在RK3588上实现了YOLOv5s模型约25FPS的实时推理性能(输入尺寸640x640)。相比纯CPU实现有近5倍的性能提升,同时保持了原始模型95%以上的检测精度。