修正检测节点计算的位置误差,优化检测和计算消耗

This commit is contained in:
liangyuxuan
2025-09-15 11:03:10 +08:00
parent 7348a99a53
commit 7b194e0d72
5 changed files with 144 additions and 103 deletions

View File

@@ -8,8 +8,6 @@ import numpy as np
import transforms3d as tfs
from cv_bridge import CvBridge
# from concurrent.futures import ProcessPoolExecutor
import torch
from detect_part.ultralytics import YOLO
@@ -24,29 +22,22 @@ from interfaces.msg import PoseWithID, PoseWithIDArray
import detect_part
# def calculate_pose_for_mask(args):
# mask, orig_shape, rgb_img, depth_img, class_id, label, camera_size, K = args
# intrinsics = o3d.camera.PinholeCameraIntrinsic(
# camera_size[0],
# camera_size[1],
# K[0],
# K[4],
# K[2],
# K[5]
# )
# x, y, z, roll, pitch, yaw = calculate_pose(mask, orig_shape, rgb_img, depth_img, intrinsics)
# return class_id, label, x, y, z, roll, pitch, yaw
def get_map(K, D, camera_size):
    """Build undistortion remap tables for a pinhole camera.

    Args:
        K: flat row-major 3x3 intrinsic matrix (9 values).
        D: distortion coefficients.
        camera_size: (width, height) of the image in pixels.

    Returns:
        (map1, map2, new_K): the two maps to feed ``cv2.remap`` and the
        flattened optimal intrinsic matrix valid after undistortion.
    """
    w, h = camera_size
    intrinsic = np.array(K).reshape(3, 3)
    dist = np.array(D)
    # alpha=1 keeps all source pixels in the rectified view.
    new_K, _ = cv2.getOptimalNewCameraMatrix(intrinsic, dist, (w, h), 1, (w, h))
    map1, map2 = cv2.initUndistortRectifyMap(
        intrinsic, dist, None, new_K, (w, h), cv2.CV_32FC1)
    return map1, map2, new_K.flatten()
def pca(data, sort=True):
center = np.mean(data, axis=0)
centered_points = data - center # 去中心化
# 计算特征值和特征向量
# cov_matrix = np.dot(centered_points.T,centered_points)
# eigenvectors, eigenvalues, eigenvectors_T = np.linalg.svd(cov_matrix)
try:
cov_matrix = np.cov(centered_points.T) # 注意要转置
cov_matrix = np.cov(centered_points.T) # 转置
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
except np.linalg.LinAlgError:
return None, None
@@ -58,29 +49,26 @@ def pca(data, sort=True):
return eigenvalues, eigenvectors
def calculate_pose(mask, orig_shape, rgb_img: np.ndarray, depth_img: np.ndarray, intrinsics):
def calculate_pose_cpu(mask, depth_img: np.ndarray, intrinsics):
"""计算位态"""
mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
depth_img_mask = np.zeros_like(depth_img)
depth_img_mask[mask > 0] = depth_img[mask > 0]
depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16)) # 转成毫米整数
color_o3d = o3d.geometry.Image(rgb_img.astype(np.uint8))
depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
rgbd_img = o3d.geometry.RGBDImage.create_from_color_and_depth(
color=color_o3d, depth=depth_o3d,
depth_scale=1000.0, # 深度单位换算
depth_trunc=3.0,
convert_rgb_to_intensity=False
point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
depth=depth_o3d,
intrinsic=intrinsics,
depth_scale=1000.0,
depth_trunc=5.0,
)
point_cloud = o3d.geometry.PointCloud.create_from_rgbd_image(rgbd_img, intrinsics)
point_cloud = point_cloud.remove_non_finite_points()
down_pcd = point_cloud.voxel_down_sample(voxel_size=0.02)
clean_pcd, ind = down_pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=2.0)
clean_pcd, _ = down_pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=2.0)
if clean_pcd.points == 0:
if len(clean_pcd.points) == 0:
return 0.0, 0.0, 0.0, None, None, None
center = clean_pcd.get_center()
x, y, z = center
@@ -168,10 +156,29 @@ def distortion_correction(color_image, depth_image, map1, map2):
return undistorted_color, undistorted_depth
def crop_mask_bbox(depth_img, mask, box):
    """Crop the depth image and mask to a detection bounding box.

    Args:
        depth_img: H x W depth image.
        mask: H x W mask (0/1 or bool), same shape as depth_img.
        box: detection box whose first four values are
            (x_center, y_center, w, h) in pixel coordinates.

    Returns:
        (depth_crop, mask_crop, (x_min, y_min)) where (x_min, y_min) is the
        top-left corner of the crop after clamping to the image bounds.
    """
    # BUG FIX: numpy .shape is (rows, cols) == (height, width). The previous
    # unpacking `width, high = depth_img.shape` swapped the two, so the x
    # slice was clamped by the image height and the y slice by the width,
    # truncating or emptying crops on non-square images.
    high, width = depth_img.shape
    x_center, y_center, w, h = box[:4]
    x_min, x_max = int(round(x_center - w/2)), int(round(x_center + w/2))
    y_min, y_max = int(round(y_center - h/2)), int(round(y_center + h/2))
    # Clamp once so the returned offset matches the slices actually taken.
    x_min, y_min = max(0, x_min), max(0, y_min)
    depth_crop = depth_img[y_min:min(y_max, high), x_min:min(x_max, width)]
    mask_crop = mask[y_min:min(y_max, high), x_min:min(x_max, width)]
    return depth_crop, mask_crop, (x_min, y_min)
class DetectNode(Node):
def __init__(self, name, mode):
super().__init__(name)
self.device = None
self.checkpoint_path = None
self.checkpoint_name = None
self.function = None
@@ -181,8 +188,10 @@ class DetectNode(Node):
self.map1 = self.map2 = None
self.intrinsics = None
self.mode = mode
self.device = None
self.calculate_function = None
self.fx = self.fy = 0.5
self.camera_size = []
self.cv_bridge = CvBridge()
'''init'''
@@ -196,6 +205,13 @@ class DetectNode(Node):
self.function = None
self.get_logger().error('Error: Mode Unknown')
if self.device == 'cpu':
self.calculate_function = calculate_pose_cpu
elif self.device == 'gpu':
raise NotImplementedError('Function: calculate_pose_gpu not implemented')
else:
raise ValueError(f"device must be cpu or gpu, now {self.device}")
self._init_publisher()
self._init_subscriber()
@@ -213,7 +229,7 @@ class DetectNode(Node):
self.declare_parameter('output_masks', False)
self.output_masks = self.get_parameter('output_masks').value
self.declare_parameter('set_confidence', 0.5)
self.declare_parameter('set_confidence', 0.6)
self.set_confidence = self.get_parameter('set_confidence').value
self.declare_parameter('color_image_topic', '/camera/color/image_raw')
@@ -225,11 +241,18 @@ class DetectNode(Node):
self.declare_parameter('camera_info_topic', '/camera/color/camera_info')
self.camera_info_topic = self.get_parameter('camera_info_topic').value
self.declare_parameter('device', 'cpu')
self.device = self.get_parameter('device').value
self.declare_parameter('width', 1280)
self.declare_parameter('high', 720)
self.expect_size = [self.get_parameter('width').value, self.get_parameter('high').value]
def _init_model(self):
"""init model"""
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device_model = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
try:
self.model = YOLO(self.checkpoint_path).to(self.device)
self.model = YOLO(self.checkpoint_path).to(device_model)
except Exception as e:
self.get_logger().error(f'Failed to load YOLO model: {e}')
raise
@@ -273,18 +296,18 @@ class DetectNode(Node):
self.K = msg.k
self.D = msg.d
self.camera_size.append(msg.width)
self.camera_size.append(msg.height)
self.camera_size = [msg.width, msg.height]
if self.K is not None and self.D is not None:
self.map1, self.map2, self.K = self._get_map(msg.k)
self.map1, self.map2, self.K = get_map(msg.k, msg.d, self.camera_size)
self.scale = [self.expect_size[0] / self.camera_size[0], self.expect_size[1] / self.camera_size[1]]
self.intrinsics = o3d.camera.PinholeCameraIntrinsic(
self.camera_size[0],
self.camera_size[1],
self.K[0],
self.K[4],
self.K[2],
self.K[5]
self.expect_size[0],
self.expect_size[1],
self.K[0] * self.scale[0],
self.K[4] * self.scale[1],
self.K[2] * self.scale[0],
self.K[5] * self.scale[1]
)
if len(self.D) != 0:
@@ -295,19 +318,15 @@ class DetectNode(Node):
else:
raise "K and D are not defined"
def _get_map(self, K):
h, w = self.camera_size[::-1]
K = np.array(K).reshape(3, 3)
D = np.array(self.D)
new_K, _ = cv2.getOptimalNewCameraMatrix(K, D, (w, h), 1, (w, h))
map1, map2 = cv2.initUndistortRectifyMap(K, D, None, new_K, (w, h), cv2.CV_32FC1)
return map1, map2, new_K.flatten()
def _sync_callback(self, color_img_ros, depth_img_ros):
"""同步回调函数"""
color_img_cv = self.cv_bridge.imgmsg_to_cv2(color_img_ros, "bgr8")
depth_img_cv = self.cv_bridge.imgmsg_to_cv2(depth_img_ros, '16UC1')
if self.camera_size != self.expect_size:
color_img_cv = cv2.resize(color_img_cv, self.expect_size)
depth_img_cv = cv2.resize(depth_img_cv, self.expect_size)
color_img_cv, depth_img_cv = distortion_correction(color_img_cv, depth_img_cv, self.map1, self.map2)
img, pose_dict = self.function(color_img_cv, depth_img_cv)
@@ -338,9 +357,9 @@ class DetectNode(Node):
pose_dict = defaultdict(list)
'''Get Predict Results'''
# time1 = time.time()
time1 = time.time()
results = self.model(rgb_img)
# time2 = time.time()
time2 = time.time()
result = results[0]
'''Get masks'''
@@ -350,50 +369,52 @@ class DetectNode(Node):
orig_shape = result.masks.orig_shape
'''Get boxes'''
boxes = result.boxes.data.cpu().numpy()
confidences = result.boxes.conf.cpu().numpy()
class_ids = result.boxes.cls.cpu().numpy()
labels = result.names
# time3 = time.time()
time3 = time.time()
# tasks = [
# (mask, orig_shape, rgb_img, depth_img, int(class_ids[i]), labels[class_ids[i]], self.camera_size, self.K)
# for i, (mask, conf) in enumerate(zip(masks, confidences))
# if conf >= self.set_confidence # 置信度低的不加入任务
# ]
# with ProcessPoolExecutor(max_workers=4) as executor:
# results = list(executor.map(calculate_pose_for_mask, tasks))
#
# for res in results:
# class_id, label, x, y, z, roll, pitch, yaw = res
# pose = Pose()
# pose.position = Point(x=x, y=y, z=z)
# pose.orientation = Quaternion(x=roll, y=pitch, z=yaw)
# pose_dict[(class_id, label)].append(pose)
for i, mask in enumerate(masks):
for i, (mask, box) in enumerate(zip(masks, boxes)):
'''Confidence Filter'''
if confidences[i] >= self.set_confidence:
x, y, z, roll, pitch, yaw = calculate_pose(mask, orig_shape, rgb_img, depth_img, self.intrinsics)
mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
depth_crop, mask_crop, (x_min, y_min) = crop_mask_bbox(depth_img, mask, box)
if depth_crop is not None:
continue
mask_crop = cv2.resize(mask_crop, None, fx=self.fx, fy=self.fy, interpolation=cv2.INTER_NEAREST)
depth_crop = cv2.resize(depth_crop, None, fx=self.fx, fy=self.fy, interpolation=cv2.INTER_NEAREST)
intrinsics = o3d.camera.PinholeCameraIntrinsic(
int(self.expect_size[0] * self.fx),
int(self.expect_size[1] * self.fy),
self.K[0] * self.scale[0] * self.fx,
self.K[4] * self.scale[1] * self.fy,
(self.K[2] - x_min) * self.scale[0] * self.fx,
(self.K[5] - y_min) * self.scale[1] * self.fy
)
x, y, z, roll, pitch, yaw = self.calculate_function(mask_crop, depth_crop, intrinsics)
if (x, y, z) != (0.0, 0.0, 0.0):
pose = Pose()
pose.position = Point(x=x, y=y, z=z)
pose.orientation = Quaternion(x=roll, y=pitch, z=yaw)
pose_dict[int(class_ids[i]), labels[class_ids[i]]].append(pose)
# time4 = time.time()
time4 = time.time()
'''mask_img and box_img is or not output'''
if self.output_boxes and not self.output_masks:
draw_box(self.set_confidence, rgb_img, result)
# time5 = time.time()
# self.get_logger().info(f'start')
# self.get_logger().info(f'{(time2 - time1)*1000} ms, model predict')
# self.get_logger().info(f'{(time3 - time2)*1000} ms, get mask and some param')
# self.get_logger().info(f'{(time4 - time3)*1000} ms, calculate all mask PCA')
# self.get_logger().info(f'{(time5 - time4)*1000} ms, draw box')
# self.get_logger().info(f'{(time5 - time1)*1000} ms, completing a picture entire process')
# self.get_logger().info(f'end')
time5 = time.time()
self.get_logger().info(f'start')
self.get_logger().info(f'{(time2 - time1)*1000} ms, model predict')
self.get_logger().info(f'{(time3 - time2)*1000} ms, get mask and some param')
self.get_logger().info(f'{(time4 - time3)*1000} ms, calculate all mask PCA')
self.get_logger().info(f'{(time5 - time4)*1000} ms, draw box')
self.get_logger().info(f'{(time5 - time1)*1000} ms, completing a picture entire process')
self.get_logger().info(f'end')
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
elif self.output_boxes and self.output_masks:
draw_box(self.set_confidence, rgb_img, result)
@@ -431,7 +452,8 @@ class DetectNode(Node):
rgb_img[mask > 0] = rgb_img[mask > 0] * 0.5 + np.array([0, 0, 255]) * 0.5
orig_shape = rgb_img_gray.shape
x, y, z, roll, pitch, yaw = calculate_pose(mask, orig_shape, rgb_img,depth_img, self.intrinsics)
mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
x, y, z, roll, pitch, yaw = self.calculate_function(mask, depth_img, self.intrinsics)
if (x, y, z) != (0.0, 0.0, 0.0):
pose = Pose()
@@ -457,6 +479,7 @@ def main(args=None):
node.destroy_node()
rclpy.shutdown()
def crossboard_detect(args=None):
rclpy.init(args=args)
node = DetectNode('detect', 'test')

View File

@@ -1,22 +1,4 @@
# Copyright 2023 Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Launch realsense2_camera node."""
import os
import yaml
from launch import LaunchDescription
import launch_ros.actions
from launch_ros.actions import Node
from launch.actions import DeclareLaunchArgument, OpaqueFunction, LogInfo
from launch.substitutions import LaunchConfiguration
@@ -31,6 +13,9 @@ def generate_launch_description():
DeclareLaunchArgument('color_image_topic', default_value='/camera/camera/color/image_raw'),
DeclareLaunchArgument('depth_image_topic', default_value='/camera/camera/depth/image_rect_raw'),
DeclareLaunchArgument('camera_info_topic', default_value='/camera/camera/color/camera_info'),
DeclareLaunchArgument('width', default_value='1280'),
DeclareLaunchArgument('high', default_value='720'),
DeclareLaunchArgument('device', default_value='cpu')
]
def create_detect_node(context):
@@ -41,6 +26,9 @@ def generate_launch_description():
color_image_topic = LaunchConfiguration('color_image_topic').perform(context)
depth_image_topic = LaunchConfiguration('depth_image_topic').perform(context)
camera_info_topic = LaunchConfiguration('camera_info_topic').perform(context)
width = LaunchConfiguration('width').perform(context)
high = LaunchConfiguration('high').perform(context)
device = LaunchConfiguration('device').perform(context)
return [
Node(
@@ -54,6 +42,9 @@ def generate_launch_description():
'color_image_topic': color_image_topic,
'depth_image_topic': depth_image_topic,
'camera_info_topic': camera_info_topic,
'width': int(width),
'high': int(high),
'device': device,
}]
)
]

View File

@@ -58,6 +58,9 @@ def generate_launch_description():
DeclareLaunchArgument('color_image_topic', default_value='/camera/color/image_raw'),
DeclareLaunchArgument('depth_image_topic', default_value='/camera/depth/image_raw'),
DeclareLaunchArgument('camera_info_topic', default_value='/camera/color/camera_info'),
DeclareLaunchArgument('width', default_value='1280'),
DeclareLaunchArgument('high', default_value='720'),
DeclareLaunchArgument('device', default_value='cpu')
]
args_camera = [
@@ -318,6 +321,9 @@ def generate_launch_description():
output_boxes = LaunchConfiguration('output_boxes').perform(context)
output_masks = LaunchConfiguration('output_masks').perform(context)
conf = LaunchConfiguration('set_confidence').perform(context)
width = LaunchConfiguration('width').perform(context)
high = LaunchConfiguration('high').perform(context)
device = LaunchConfiguration('device').perform(context)
return [
Node(
@@ -327,7 +333,10 @@ def generate_launch_description():
'checkpoint_name': checkpoint,
'output_boxes': output_boxes.lower() == 'true',
'output_masks': output_masks.lower() == 'true',
'set_confidence': float(conf)
'set_confidence': float(conf),
'width': int(width),
'high': int(high),
'device': device
}]
)
]

View File

@@ -54,10 +54,13 @@ def generate_launch_description():
DeclareLaunchArgument('checkpoint_name', default_value='yolo11s-seg.pt'),
DeclareLaunchArgument('output_boxes', default_value='True'),
DeclareLaunchArgument('output_masks', default_value='False'),
DeclareLaunchArgument('set_confidence', default_value='0.6'),
DeclareLaunchArgument('set_confidence', default_value='0.2'),
DeclareLaunchArgument('color_image_topic', default_value='/camera/color/image_raw'),
DeclareLaunchArgument('depth_image_topic', default_value='/camera/depth/image_raw'),
DeclareLaunchArgument('camera_info_topic', default_value='/camera/color/camera_info'),
DeclareLaunchArgument('width', default_value='1280'),
DeclareLaunchArgument('high', default_value='720'),
DeclareLaunchArgument('device', default_value='cpu')
]
args_camera = [
@@ -321,6 +324,9 @@ def generate_launch_description():
color_image_topic = LaunchConfiguration('color_image_topic').perform(context)
depth_image_topic = LaunchConfiguration('depth_image_topic').perform(context)
camera_info_topic = LaunchConfiguration('camera_info_topic').perform(context)
width = LaunchConfiguration('width').perform(context)
high = LaunchConfiguration('high').perform(context)
device = LaunchConfiguration('device').perform(context)
return [
Node(
@@ -334,6 +340,9 @@ def generate_launch_description():
'color_image_topic': color_image_topic,
'depth_image_topic': depth_image_topic,
'camera_info_topic': camera_info_topic,
'width': int(width),
'high': int(high),
'device': device
}]
)
]

View File

@@ -150,6 +150,9 @@ def generate_launch_description():
DeclareLaunchArgument('color_image_topic', default_value='/camera/camera/color/image_raw'),
DeclareLaunchArgument('depth_image_topic', default_value='/camera/camera/depth/image_rect_raw'),
DeclareLaunchArgument('camera_info_topic', default_value='/camera/camera/color/camera_info'),
DeclareLaunchArgument('width', default_value='1280'),
DeclareLaunchArgument('high', default_value='720'),
DeclareLaunchArgument('device', default_value='cpu')
]
def create_detect_node(context):
@@ -160,6 +163,9 @@ def generate_launch_description():
color_image_topic = LaunchConfiguration('color_image_topic').perform(context)
depth_image_topic = LaunchConfiguration('depth_image_topic').perform(context)
camera_info_topic = LaunchConfiguration('camera_info_topic').perform(context)
width = LaunchConfiguration('width').perform(context)
high = LaunchConfiguration('high').perform(context)
device = LaunchConfiguration('device').perform(context)
return [
Node(
@@ -173,6 +179,9 @@ def generate_launch_description():
'color_image_topic': color_image_topic,
'depth_image_topic': depth_image_topic,
'camera_info_topic': camera_info_topic,
'width': int(width),
'high': int(high),
'device': device
}]
)
]