53 Commits

Author SHA1 Message Date
liangyuxuan
cc520c349e compliete initbase 2026-01-20 19:58:14 +08:00
liangyuxuan
52c10dcb3c Merge branch 'feature' into feature_cpp_test 2026-01-19 14:09:29 +08:00
liangyuxuan
611b529075 修改抓取位置微调 2026-01-19 14:06:09 +08:00
liangyuxuan
039bb2d165 添加抓取位置微调、检测图片和原图片保存 2026-01-15 17:39:45 +08:00
liangyuxuan
1c9af20786 Merge branch 'feature' into feature_cpp_test 2026-01-12 09:58:05 +08:00
liangyuxuan
a59776e6c2 增加end2end方式计算位姿 2026-01-12 09:57:30 +08:00
liangyuxuan
f937329e4b 增加end2end方式计算位姿 2026-01-12 09:48:58 +08:00
liangyuxuan
6fbf13e232 增加end2end方式计算位姿 2026-01-12 09:48:54 +08:00
liangyuxuan
cd4bbdc5f3 1 2025-12-30 17:34:23 +08:00
liangyuxuan
97c142f6e3 更改去噪参数 2025-12-30 13:22:00 +08:00
liangyuxuan
eeedd4bfc6 1 2025-12-28 16:24:16 +08:00
liangyuxuan
08319d9eba 1 2025-12-28 16:19:49 +08:00
liangyuxuan
7d3424b7e5 Merge branch 'feature' into feature_cpp_test 2025-12-28 13:58:17 +08:00
liangyuxuan
3025b5a23a 采集的深度数据更改为.png存储 2025-12-28 13:57:37 +08:00
liangyuxuan
563737932b Merge branch 'feature' into feature_cpp_test 2025-12-27 16:49:25 +08:00
liangyuxuan
93f3a8d45e 移动文件位置 2025-12-27 16:47:46 +08:00
liangyuxuan
12fed65d8a 更改深度raw的保存方式为.zip 2025-12-27 16:45:55 +08:00
liangyuxuan
87a5697e75 迁移数据采集节点至vision_tools,修改点下采样参数 2025-12-27 11:08:53 +08:00
liangyuxuan
8279687f4c 修正数据采集节点 2025-12-25 15:39:13 +08:00
liangyuxuan
aff813b107 检测物体排序修正 2025-12-24 19:48:16 +08:00
liangyuxuan
a89e525e7a Merge branch 'feature' into feature_cpp_test 2025-12-24 17:52:42 +08:00
liangyuxuan
a5f340bc35 添加相机数据采集服务客户端 2025-12-24 17:51:56 +08:00
liangyuxuan
d730325a0d 添加相机数据采集服务客户端 2025-12-24 17:51:03 +08:00
liangyuxuan
2e4cb18fa6 添加相机数据采集服务客户端 2025-12-24 17:33:26 +08:00
liangyuxuan
facd43f54f 添加相机数据采集节点 2025-12-24 16:44:06 +08:00
liangyuxuan
59b90201e4 设置检测输出顺序为:左->右,下->上 2025-12-24 10:42:44 +08:00
liangyuxuan
db330ec33c 1 2025-12-23 18:25:22 +08:00
liangyuxuan
610f488d1e Merge branch 'feature' into feature_cpp_test 2025-12-23 15:14:50 +08:00
liangyuxuan
d62faaa9cc 1 2025-12-23 15:13:01 +08:00
liangyuxuan
eec02de5a7 删除多余文件 2025-12-23 09:57:52 +08:00
liangyuxuan
389bd2b50e Merge branch 'feature' into feature_cpp_test 2025-12-19 15:51:38 +08:00
liangyuxuan
b8444c82b9 头部相机标定文件 2025-12-19 15:50:59 +08:00
liangyuxuan
44b2cc43e0 add onnxruntime inference(no post-processing) 2025-12-18 18:52:49 +08:00
liangyuxuan
4f8ae43239 update test.cpp CMakeLists.txt 2025-12-18 09:07:09 +08:00
liangyuxuan
78c129da83 Merge branch 'feature' into feature_cpp_test 2025-12-17 14:07:04 +08:00
liangyuxuan
0f27c5cd3c Merge branch 'feature' into feature_cpp_test 2025-12-16 17:26:29 +08:00
liangyuxuan
0c41be3154 1 2025-12-12 18:41:45 +08:00
liangyuxuan
6dafd77bcc 1 2025-12-12 16:03:12 +08:00
liangyuxuan
9fd8951fae 1 2025-12-12 15:47:41 +08:00
liangyuxuan
c5a4b5afed 1 2025-12-12 15:07:28 +08:00
liangyuxuan
6ddf95f737 1 2025-12-12 10:07:30 +08:00
liangyuxuan
360372f681 Merge branch 'feature' into feature_cpp_test 2025-12-12 10:05:49 +08:00
liangyuxuan
b264bbd6bb 1 2025-12-11 18:36:15 +08:00
liangyuxuan
ae8f7d370e add CMakeLists 2025-12-11 18:17:47 +08:00
liangyuxuan
e13deca6b7 update 2025-12-11 15:59:32 +08:00
liangyuxuan
f8d4167542 Merge branch 'feature' into feature_cpp_test 2025-12-10 14:50:48 +08:00
liangyuxuan
8c7c38eaf4 ConfigBase类完成-CPP 2025-12-05 18:48:09 +08:00
liangyuxuan
250e2600f8 Merge branch 'feature' into feature_cpp_test 2025-12-05 14:28:49 +08:00
liangyuxuan
98483772c2 Merge branch 'feature' into feature_cpp_test 2025-12-04 11:16:19 +08:00
liangyuxuan
ed4136b36d Merge branch 'feature' into feature_cpp_test 2025-12-04 11:12:23 +08:00
liangyuxuan
0af9c10189 Merge remote-tracking branch 'origin/feature_cpp_test' into feature_cpp_test 2025-12-03 18:30:58 +08:00
liangyuxuan
36077ceb19 Merge branch 'feature' into feature_cpp_test 2025-12-03 18:10:44 +08:00
liangyuxuan
8e13d777ff add cpp 2025-12-03 13:58:18 +08:00
1952 changed files with 405801 additions and 3578 deletions

1
.gitignore vendored
View File

@@ -3,3 +3,4 @@
**/build/
**/install/
**/log/
**/VisionDetect/**/**.so

View File

@@ -0,0 +1,28 @@
{
"T": [
[
-0.029312906184646676,
0.9941492398859674,
0.10396173510077301,
-0.0809046635564155
],
[
-0.9993360095995784,
-0.02689542239474161,
-0.024579995358005674,
0.036195409684615196
],
[
-0.021640088923136416,
-0.10461321660460321,
0.9942775173275502,
0.04913468358915477
],
[
0.0,
0.0,
0.0,
1.0
]
]
}

View File

@@ -1,22 +1,22 @@
{
"T": [
[
0.013635791720580838,
0.9992765703636767,
-0.03550212819481636,
-0.08645650535173793
0.02515190926712163,
0.9984434279780899,
-0.049780544267610596,
-0.08312977955463981
],
[
-0.9998865276218899,
0.013399556063222661,
-0.006883587549256321,
0.036196137264974
-0.9996337211449028,
0.025617085950903107,
0.008728599966714646,
0.059044674332170574
],
[
-0.006402895000908794,
0.03559196285001416,
0.999345893630474,
0.014407883180676354
0.00999024575340213,
0.04954276975245833,
0.9987220378839358,
0.017378234075134728
],
[
0.0,

View File

@@ -0,0 +1,28 @@
{
"T": [
[
0.01868075138315295,
0.8881359544145228,
0.45920099738999437,
-0.19284748723617093
],
[
0.002854261818715398,
-0.4593266423892484,
0.8882628489252997,
0.057154511710361816
],
[
0.9998214254141742,
-0.01528273756969839,
-0.011115539354645598,
-0.04187423675125192
],
[
0.0,
0.0,
0.0,
1.0
]
]
}

View File

@@ -192,13 +192,13 @@ class _Calibration:
np.linalg.inv(
self._function(
self._hand[i], self._hand[i + 1], self._hand[i + 2],
self._hand[i + 3], self._hand[i + 4], self._hand[i + 5], self._hand[i + 6]
self._hand[i + 6], self._hand[i + 3], self._hand[i + 4], self._hand[i + 5]
)
)
if self._mode == 'eye_to_hand' else
self._function(
self._hand[i], self._hand[i + 1], self._hand[i + 2],
self._hand[i + 3], self._hand[i + 4], self._hand[i + 5], self._hand[i + 6]
self._hand[i + 6], self._hand[i + 3], self._hand[i + 4], self._hand[i + 5]
)
)
self._Hcs.append(

View File

@@ -52,7 +52,7 @@ def calculate(mode, hand_path, camera_path, hand_eye_path):
for i in range(sum_):
hand = [hand_all[i*7], hand_all[i*7+1], hand_all[i*7+2], hand_all[i*7+3], hand_all[i*7+4], hand_all[i*7+5], hand_all[i*7+6]]
camera = [camera_all[i*7], camera_all[i*7+1], camera_all[i*7+2], camera_all[i*7+3], camera_all[i*7+4], camera_all[i*7+5], camera_all[i*7+6]]
T_hand_in_base = get_matrix_quat(hand[0], hand[1], hand[2], hand[3], hand[4], hand[5], hand[6])
T_hand_in_base = get_matrix_quat(hand[0], hand[1], hand[2], hand[6], hand[3], hand[4], hand[5])
T_cal_in_camera = get_matrix_quat(camera[0], camera[1], camera[2], camera[3], camera[4], camera[5], camera[6])
if mode == "eye_to_hand":

24
tools/pt2vino.py Normal file
View File

@@ -0,0 +1,24 @@
import argparse
from ultralytics import YOLO
def main(checkpoint_path):
    """Export a YOLO checkpoint to an OpenVINO model directory.

    The export targets CPU, uses FP16 weights, a fixed 1280x720 base image
    size with dynamic shapes enabled, and batch size 1.
    """
    export_options = dict(
        format="openvino",
        imgsz=(1280, 720),
        dynamic=True,
        simplify=True,
        half=True,
        workspace=0.8,
        batch=1,
        device="cpu",
    )
    YOLO(checkpoint_path).export(**export_options)
if __name__ == "__main__":
    # CLI entry point: a single positional argument naming the .pt checkpoint.
    cli = argparse.ArgumentParser()
    cli.add_argument("checkpoint_path", type=str)
    cli_args = cli.parse_args()
    main(cli_args.checkpoint_path)
    # main(checkpoint_path="checkpoint/medical_sense-seg.pt")

View File

@@ -0,0 +1,29 @@
{
"info": {},
"warring": {
"0000": "Success",
"1000": "Detected object count is 0",
"1001": "Depth crop is None",
"1003": "Failed to detect a valid pose",
"1100": "Object point cloud contains excessive noise",
"1101": "The point cloud is empty",
"1200": "The number of points is insufficient to compute an OBB",
"1201": "PCA output vector is None",
"1202": "This pose cannot be grasped, and position refinement failed",
"1300": "E2E model input data 'coors' are fewer than 128",
"1301": "E2E model input data 'point_clouds' are fewer than 128",
"1302": "The 'true num' of points is 0; No graspable points are available",
"1303": "The model returned no predictions",
"1304": "All rotation vector processing failed; no valid rotation matrix was generated"
},
"error": {},
"fatal": {}
}

View File

@@ -48,5 +48,12 @@
"ransac_voxel_size": 0.005,
"icp_voxel_radius": [0.004, 0.002, 0.001],
"icp_max_iter": [50, 30, 14]
},
"E2E_configs": {
"checkpoint_path": "checkpoints/posenet.pt",
"depth_scale": 1000.0,
"depth_trunc": 3.0,
"voxel_size": 0.010,
"collision_thresh": 0.01
}
}

View File

@@ -1,5 +1,5 @@
{
"node_name": "bottle_detect_service",
"node_name": "medical_sense_service",
"output_boxes": "True",
"output_masks": "False",
@@ -17,7 +17,7 @@
"detect_mode": "Detect",
"Detect_configs": {
"checkpoint_path": "checkpoints/medical_sense-seg.pt",
"confidence": 0.50,
"confidence": 0.75,
"classes": []
},
@@ -25,6 +25,13 @@
"PCA_configs": {
"depth_scale": 1000.0,
"depth_trunc": 3.0,
"voxel_size": 0.010
"voxel_size": 0.005
},
"E2E_configs": {
"checkpoint_path": "checkpoints/posenet.pt",
"depth_scale": 1000.0,
"depth_trunc": 3.0,
"voxel_size": 0.0005,
"collision_thresh": 0.01
}
}

View File

@@ -1,47 +0,0 @@
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import DeclareLaunchArgument, OpaqueFunction
from launch.substitutions import LaunchConfiguration
import os
import ast
import json
from ament_index_python.packages import get_package_share_directory
share_dir = get_package_share_directory('vision_detect')
def generate_launch_description():
    """Build the launch description for the crossboard detection node.

    Declares the output-flag and topic-name launch arguments, then spawns
    the ``crossboard_detect_node`` from the ``vision_detect`` package with
    those values passed as ROS parameters.
    """
    args_detect = [
        DeclareLaunchArgument('output_boxes', default_value='True'),
        DeclareLaunchArgument('output_masks', default_value='False'),
        DeclareLaunchArgument('color_image_topic', default_value='/camera/color/image_raw'),
        DeclareLaunchArgument('depth_image_topic', default_value='/camera/depth/image_raw'),
        DeclareLaunchArgument('camera_info_topic', default_value='/camera/color/camera_info'),
    ]

    def create_detect_node(context):
        # Resolve the launch-argument substitutions into plain strings.
        output_boxes = LaunchConfiguration('output_boxes').perform(context)
        output_masks = LaunchConfiguration('output_masks').perform(context)
        color_image_topic = LaunchConfiguration('color_image_topic').perform(context)
        depth_image_topic = LaunchConfiguration('depth_image_topic').perform(context)
        camera_info_topic = LaunchConfiguration('camera_info_topic').perform(context)
        return [
            Node(
                package='vision_detect',
                executable='crossboard_detect_node',
                parameters=[{
                    # Boolean flags arrive as strings; compare case-insensitively.
                    'output_boxes': output_boxes.lower() == 'true',
                    'output_masks': output_masks.lower() == 'true',
                    'color_image_topic': color_image_topic,
                    'depth_image_topic': depth_image_topic,
                    'camera_info_topic': camera_info_topic,
                }]
            )
        ]

    return LaunchDescription(args_detect + [
        OpaqueFunction(function=create_detect_node),
    ])

View File

@@ -1,49 +0,0 @@
from launch import LaunchDescription
from launch_ros.actions import Node
from launch.actions import DeclareLaunchArgument, OpaqueFunction
from launch.substitutions import LaunchConfiguration
import os
import ast
import json
from ament_index_python.packages import get_package_share_directory
share_dir = get_package_share_directory('vision_detect')
config_dir = os.path.join(share_dir, 'configs/launch_configs/detect_service.json')
with open(config_dir, "r") as f:
configs = json.load(f)
def generate_launch_description():
    """Build the launch description for the detection service node.

    Argument defaults come from ``detect_service.json`` (loaded into
    ``configs`` at module import); each value can still be overridden on
    the launch command line.
    """
    args_detect = [
        DeclareLaunchArgument('checkpoint_name', default_value=configs['checkpoint_name']),
        DeclareLaunchArgument('output_boxes', default_value=configs['output_boxes']),
        DeclareLaunchArgument('output_masks', default_value=configs['output_masks']),
        DeclareLaunchArgument('set_confidence', default_value=configs['set_confidence']),
        DeclareLaunchArgument('classes', default_value=configs['classes']),
    ]

    def create_detect_node(context):
        # Resolve the launch-argument substitutions into plain strings.
        checkpoint = LaunchConfiguration('checkpoint_name').perform(context)
        output_boxes = LaunchConfiguration('output_boxes').perform(context)
        output_masks = LaunchConfiguration('output_masks').perform(context)
        conf = LaunchConfiguration('set_confidence').perform(context)
        classes = LaunchConfiguration('classes').perform(context)
        return [
            Node(
                package='vision_detect',
                executable='detect_service_node',
                parameters=[{
                    'checkpoint_name': checkpoint,
                    # Boolean flags arrive as strings; compare case-insensitively.
                    'output_boxes': output_boxes.lower() == 'true',
                    'output_masks': output_masks.lower() == 'true',
                    'set_confidence': float(conf),
                    'classes': classes,
                }]
            )
        ]

    return LaunchDescription(args_detect + [
        OpaqueFunction(function=create_detect_node),
    ])

View File

@@ -26,6 +26,7 @@ def generate_launch_description():
package='vision_detect',
executable='detect_node',
name=configs['node_name'],
output="screen",
parameters=[{
'configs_path': configs_path,
}]

View File

@@ -1,26 +1,47 @@
import os
from glob import glob
from setuptools import find_packages, setup
package_name = 'vision_detect'
openvino_files = []
for model_dir in glob("checkpoints/*_openvino_model"):
model_name = os.path.basename(model_dir)
model_files = glob(os.path.join(model_dir, '*'))
openvino_files.append(
('share/' + package_name + '/checkpoints/' + model_name, model_files)
)
data_files = [
('share/ament_index/resource_index/packages', ['resource/' + package_name]),
('share/' + package_name, ['package.xml']),
('share/' + package_name + '/launch', glob('launch/*.launch.py')),
('share/' + package_name + '/configs', glob('configs/*.json')),
('share/' + package_name + '/configs/flexiv_configs',
glob('configs/flexiv_configs/*.json')),
('share/' + package_name + '/configs/hand_eye_mat', glob('configs/hand_eye_mat/*.json')),
('share/' + package_name + '/configs/launch_configs',
glob('configs/launch_configs/*.json')),
('share/' + package_name + '/configs/error_configs', glob('configs/error_configs/*.json')),
('share/' + package_name + '/checkpoints', glob('checkpoints/*.pt')),
('share/' + package_name + '/checkpoints', glob('checkpoints/*.onnx')),
('share/' + package_name + '/checkpoints', glob('checkpoints/*.engine')),
('share/' + package_name + '/pointclouds', glob('pointclouds/*.pcd')),
('lib/python3.10/site-packages/' + package_name + '/VisionDetect/net/pointnet2/pointnet2',
glob('vision_detect/VisionDetect/net/pointnet2/pointnet2/*.so')),
('lib/python3.10/site-packages/' + package_name + '/VisionDetect/net/knn/knn_pytorch',
glob('vision_detect/VisionDetect/net/knn/knn_pytorch/*.so'))
]
data_files.extend(openvino_files)
setup(
name=package_name,
version='0.0.0',
packages=find_packages(exclude=['test']),
data_files=[
('share/ament_index/resource_index/packages', ['resource/' + package_name]),
('share/' + package_name, ['package.xml']),
('share/' + package_name + '/launch', glob('launch/*.launch.py')),
('share/' + package_name + '/configs', glob('configs/*.json')),
('share/' + package_name + '/configs/flexiv_configs', glob('configs/flexiv_configs/*.json')),
('share/' + package_name + '/configs/hand_eye_mat', glob('configs/hand_eye_mat/*.json')),
('share/' + package_name + '/configs/launch_configs', glob('configs/launch_configs/*.json')),
('share/' + package_name + '/checkpoints', glob('checkpoints/*.pt')),
('share/' + package_name + '/checkpoints', glob('checkpoints/*.onnx')),
('share/' + package_name + '/checkpoints', glob('checkpoints/*.engine')),
('share/' + package_name + '/pointclouds', glob('pointclouds/*.pcd')),
],
data_files=data_files,
install_requires=['setuptools'],
zip_safe=True,
include_package_data=True,
@@ -31,10 +52,6 @@ setup(
tests_require=['pytest'],
entry_points={
'console_scripts': [
# 'detect_service_node = vision_detect.detect_service:main',
# 'detect_topic_node = vision_detect.detect_topic:main',
# 'box_detect_service_node = vision_detect.detect_box_service:main',
# 'red_detect_topic_node = vision_detect.detect_red_topic:main',
# 'red_detect_service_node = vision_detect.detect_red_service:main',
@@ -42,11 +59,8 @@ setup(
'flexiv_detect_service_node = vision_detect.flexivaidk_detect_service:main',
'sub_pose_node = vision_detect.sub_pose:main',
# 'calibration_node = vision_detect.hand_eye_calibration:main',
# 'crossboard_detect_node = vision_detect.crossboard_detect:main',
'service_client_node = vision_detect.service_client:main',
# 'get_camera_pose_node = vision_detect.get_camera_pose:main',
# 'calculate_node = vision_detect.calculate:main',
'get_camera_pose_node = vision_detect.get_camera_pose:main',
'detect_node = vision_detect.detect_node:main',
],

View File

@@ -0,0 +1 @@
from .gsnet import *

View File

@@ -0,0 +1,7 @@
from .models.graspnet import *
from .utils.collision_detector import *
__all__ = [
"GraspNet", "pred_decode", "ModelFreeCollisionDetector"
]

View File

@@ -0,0 +1,132 @@
import os
import sys
import numpy as np
import argparse
from PIL import Image
import torch
import open3d as o3d
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(ROOT_DIR)
from models.graspnet import GraspNet, pred_decode
import collections.abc as container_abcs
import MinkowskiEngine as ME
from utils.collision_detector import ModelFreeCollisionDetector
# Command-line configuration shared by every function in this demo script.
# NOTE(review): parse_args() runs at import time, so importing this module
# from another program will consume that program's argv — confirm intended.
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint_path', default='checkpoints/minkuresunet_realsense.tar')
parser.add_argument('--dump_dir', help='Dump dir to save outputs', default='results/')
parser.add_argument('--seed_feat_dim', default=512, type=int, help='Point wise feature dim')
parser.add_argument('--collision_thresh', type=float, default=0.01,
                    help='Collision Threshold in collision detection [default: 0.01]')
parser.add_argument('--voxel_size_cd', type=float, default=0.01,
                    help='Voxel Size for collision detection')
# NOTE(review): 'store_true' combined with default=True means these two
# flags are always True regardless of the command line.
parser.add_argument('--infer', action='store_true', default=True)
parser.add_argument('--vis', action='store_true', default=True)
cfgs = parser.parse_args()
# ------------------------------------------------------------------------- GLOBAL CONFIG BEG
# if not os.path.exists(cfgs.dump_dir):
#     os.mkdir(cfgs.dump_dir)
def minkowski_collate_fn(list_data):
    """Collate a list of sample dicts into a MinkowskiEngine sparse batch.

    'coors'/'feats' are batched and voxel-quantized by MinkowskiEngine;
    'quantize2original' maps the quantized rows back to the original
    points.  Any other keys in the samples are converted recursively to
    stacked torch tensors by the nested helper and merged into the same
    result dict.
    """
    coordinates_batch, features_batch = ME.utils.sparse_collate([d["coors"] for d in list_data],
                                                                [d["feats"] for d in list_data])
    # sparse_quantize expects integer, C-contiguous coordinates.
    coordinates_batch = np.ascontiguousarray(coordinates_batch, dtype=np.int32)
    coordinates_batch, features_batch, _, quantize2original = ME.utils.sparse_quantize(
        coordinates_batch, features_batch, return_index=True, return_inverse=True)
    res = {
        "coors": coordinates_batch,
        "feats": features_batch,
        "quantize2original": quantize2original
    }

    def collate_fn_(batch):
        # Recursively convert numpy arrays into stacked torch tensors.  For
        # mappings it writes into the shared `res` dict from the enclosing
        # scope, skipping the already-quantized 'coors'/'feats' keys.
        if type(batch[0]).__module__ == 'numpy':
            return torch.stack([torch.from_numpy(b) for b in batch], 0)
        elif isinstance(batch[0], container_abcs.Sequence):
            return [[torch.from_numpy(sample) for sample in b] for b in batch]
        elif isinstance(batch[0], container_abcs.Mapping):
            for key in batch[0]:
                if key == 'coors' or key == 'feats':
                    continue
                res[key] = collate_fn_([d[key] for d in batch])
            return res

    res = collate_fn_(list_data)
    return res
def data_process(pcd:o3d.geometry.PointCloud, voxel_size:float = 0.005):
    """Transform the cloud into the table frame and build the network input.

    Returns a dict with float32 'point_clouds' (N x 3), voxel 'coors'
    (points divided by ``voxel_size``) and all-ones 'feats'.
    """
    cam_index = 0
    cam_poses = np.array([np.eye(4)])  # camera poses
    table_align = np.eye(4)  # pose of Camera_0 relative to the table surface
    world_transform = np.dot(table_align, cam_poses[int(cam_index)])
    pcd.transform(world_transform)
    xyz = np.asarray(pcd.points).astype(np.float32)
    return {
        'point_clouds': xyz,
        'coors': xyz / voxel_size,
        'feats': np.ones_like(xyz),
    }
def get_grasp_dict(preds):
    """Split an (N, 17) grasp prediction array into named attributes.

    Column layout: score, width, height, depth, 9 rotation entries
    (reshaped to N x 3 x 3), 3 translation entries, object id.
    """
    rotation_mats = preds[:, 4:13].reshape(-1, 3, 3)
    translations = preds[:, 13:16].reshape(-1, 3)
    return {
        "score": preds[:, 0],
        "width": preds[:, 1],
        "height": preds[:, 2],
        "depth": preds[:, 3],
        "rotation": rotation_mats,
        "translation": translations,
        "object_id": preds[:, 16],
    }
def inference(pcd:o3d.geometry.PointCloud, voxel_size:float = 0.005):
    """Run GraspNet on a point cloud and return the best grasp predictions.

    The cloud is preprocessed by ``data_process``/``minkowski_collate_fn``,
    scored by the network, the 10 highest-scoring grasps are kept, and
    grasps colliding with the scene cloud are filtered out.

    Returns an (N, 17) numpy array of grasp predictions, N <= 10.
    """
    data_input = data_process(pcd, voxel_size)
    batch_data = minkowski_collate_fn([data_input])
    net = GraspNet(seed_feat_dim=512, is_training=False)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # device = torch.device("cpu")
    net.to(device)
    # Load checkpoint.  Bug fix: load_state_dict() expects a state dict, not
    # a file path -- deserialize the checkpoint from disk first.  GraspNet
    # checkpoints commonly wrap the weights under 'model_state_dict';
    # fall back to the raw object otherwise.
    checkpoint_path = os.path.join(ROOT_DIR, cfgs.checkpoint_path)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        checkpoint = checkpoint['model_state_dict']
    net.load_state_dict(checkpoint)
    net.eval()
    for key in batch_data:
        batch_data[key] = batch_data[key].to(device)
    # Forward pass
    with torch.no_grad():
        end_points = net(batch_data)
        grasp_preds = pred_decode(end_points)
    preds = grasp_preds[0].detach().cpu().numpy()
    # Keep the 10 highest-scoring grasps (column 0 is the score).
    sorted_index = np.argsort(-preds[:, 0])
    preds = preds[sorted_index]
    preds = preds[:10]
    # collision detection
    if cfgs.collision_thresh > 0:
        cloud = data_input['point_clouds']
        mfcdetector = ModelFreeCollisionDetector(cloud, voxel_size=cfgs.voxel_size_cd)
        collision_mask = mfcdetector.detect(get_grasp_dict(preds), approach_dist=0.05,
                                            collision_thresh=cfgs.collision_thresh)
        preds = preds[~collision_mask]
    return preds
if __name__ == '__main__':
    # Bug fix: the original called data_process() without its required point
    # cloud argument and then passed the resulting dict to inference(), which
    # expects the raw open3d point cloud (it runs data_process internally).
    # Load a cloud from disk and hand it to inference() directly.
    pcd = o3d.io.read_point_cloud(os.path.join(ROOT_DIR, 'example.pcd'))  # TODO(review): point at a real capture
    preds = inference(pcd)
    print(preds)

View File

@@ -0,0 +1,224 @@
import MinkowskiEngine as ME
from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck
from .resnet import ResNetBase
class MinkUNetBase(ResNetBase):
    """U-Net-style sparse-convolution backbone built on MinkowskiEngine.

    Four stride-2 encoder stages (conv + residual blocks) followed by four
    transposed-conv decoder stages; each decoder stage concatenates the
    encoder features of the matching resolution (skip connections) before
    its residual blocks.  Subclasses select BLOCK/LAYERS/PLANES.
    """
    BLOCK = None
    # NOTE(review): this PLANES = None is immediately shadowed by the tuple
    # assignment a few lines below; subclasses override PLANES again.
    PLANES = None
    DILATIONS = (1, 1, 1, 1, 1, 1, 1, 1)
    LAYERS = (2, 2, 2, 2, 2, 2, 2, 2)
    PLANES = (32, 64, 128, 256, 256, 128, 96, 96)
    INIT_DIM = 32
    OUT_TENSOR_STRIDE = 1

    # To use the model, must call initialize_coords before forward pass.
    # Once data is processed, call clear to reset the model before calling
    # initialize_coords
    def __init__(self, in_channels, out_channels, D=3):
        # D is the spatial dimension of the sparse tensors (3 for point clouds).
        ResNetBase.__init__(self, in_channels, out_channels, D)

    def network_initialization(self, in_channels, out_channels, D):
        """Instantiate all encoder/decoder layers (called by ResNetBase)."""
        # Output of the first conv concated to conv6
        self.inplanes = self.INIT_DIM
        # Stem: full-resolution conv, no stride.
        self.conv0p1s1 = ME.MinkowskiConvolution(
            in_channels, self.inplanes, kernel_size=5, dimension=D)
        self.bn0 = ME.MinkowskiBatchNorm(self.inplanes)
        # Encoder stage 1 (tensor stride 2).
        self.conv1p1s2 = ME.MinkowskiConvolution(
            self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D)
        self.bn1 = ME.MinkowskiBatchNorm(self.inplanes)
        self.block1 = self._make_layer(self.BLOCK, self.PLANES[0],
                                       self.LAYERS[0])
        # Encoder stage 2 (tensor stride 4).
        self.conv2p2s2 = ME.MinkowskiConvolution(
            self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D)
        self.bn2 = ME.MinkowskiBatchNorm(self.inplanes)
        self.block2 = self._make_layer(self.BLOCK, self.PLANES[1],
                                       self.LAYERS[1])
        # Encoder stage 3 (tensor stride 8).
        self.conv3p4s2 = ME.MinkowskiConvolution(
            self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D)
        self.bn3 = ME.MinkowskiBatchNorm(self.inplanes)
        self.block3 = self._make_layer(self.BLOCK, self.PLANES[2],
                                       self.LAYERS[2])
        # Encoder stage 4 (tensor stride 16, bottleneck).
        self.conv4p8s2 = ME.MinkowskiConvolution(
            self.inplanes, self.inplanes, kernel_size=2, stride=2, dimension=D)
        self.bn4 = ME.MinkowskiBatchNorm(self.inplanes)
        self.block4 = self._make_layer(self.BLOCK, self.PLANES[3],
                                       self.LAYERS[3])
        # Decoder stage 1: upsample to stride 8, concat encoder stage-3 output.
        self.convtr4p16s2 = ME.MinkowskiConvolutionTranspose(
            self.inplanes, self.PLANES[4], kernel_size=2, stride=2, dimension=D)
        self.bntr4 = ME.MinkowskiBatchNorm(self.PLANES[4])
        self.inplanes = self.PLANES[4] + self.PLANES[2] * self.BLOCK.expansion
        self.block5 = self._make_layer(self.BLOCK, self.PLANES[4],
                                       self.LAYERS[4])
        # Decoder stage 2: upsample to stride 4, concat encoder stage-2 output.
        self.convtr5p8s2 = ME.MinkowskiConvolutionTranspose(
            self.inplanes, self.PLANES[5], kernel_size=2, stride=2, dimension=D)
        self.bntr5 = ME.MinkowskiBatchNorm(self.PLANES[5])
        self.inplanes = self.PLANES[5] + self.PLANES[1] * self.BLOCK.expansion
        self.block6 = self._make_layer(self.BLOCK, self.PLANES[5],
                                       self.LAYERS[5])
        # Decoder stage 3: upsample to stride 2, concat encoder stage-1 output.
        self.convtr6p4s2 = ME.MinkowskiConvolutionTranspose(
            self.inplanes, self.PLANES[6], kernel_size=2, stride=2, dimension=D)
        self.bntr6 = ME.MinkowskiBatchNorm(self.PLANES[6])
        self.inplanes = self.PLANES[6] + self.PLANES[0] * self.BLOCK.expansion
        self.block7 = self._make_layer(self.BLOCK, self.PLANES[6],
                                       self.LAYERS[6])
        # Decoder stage 4: upsample to full resolution, concat stem output.
        self.convtr7p2s2 = ME.MinkowskiConvolutionTranspose(
            self.inplanes, self.PLANES[7], kernel_size=2, stride=2, dimension=D)
        self.bntr7 = ME.MinkowskiBatchNorm(self.PLANES[7])
        self.inplanes = self.PLANES[7] + self.INIT_DIM
        self.block8 = self._make_layer(self.BLOCK, self.PLANES[7],
                                       self.LAYERS[7])
        # Final 1x1 conv projecting to the requested output channels.
        self.final = ME.MinkowskiConvolution(
            self.PLANES[7] * self.BLOCK.expansion,
            out_channels,
            kernel_size=1,
            bias=True,
            dimension=D)
        self.relu = ME.MinkowskiReLU(inplace=True)

    def forward(self, x):
        """Encode then decode ``x``; returns a sparse tensor at full resolution."""
        out = self.conv0p1s1(x)
        out = self.bn0(out)
        out_p1 = self.relu(out)
        out = self.conv1p1s2(out_p1)
        out = self.bn1(out)
        out = self.relu(out)
        out_b1p2 = self.block1(out)
        out = self.conv2p2s2(out_b1p2)
        out = self.bn2(out)
        out = self.relu(out)
        out_b2p4 = self.block2(out)
        out = self.conv3p4s2(out_b2p4)
        out = self.bn3(out)
        out = self.relu(out)
        out_b3p8 = self.block3(out)
        # tensor_stride=16
        out = self.conv4p8s2(out_b3p8)
        out = self.bn4(out)
        out = self.relu(out)
        out = self.block4(out)
        # tensor_stride=8
        out = self.convtr4p16s2(out)
        out = self.bntr4(out)
        out = self.relu(out)
        out = ME.cat(out, out_b3p8)
        out = self.block5(out)
        # tensor_stride=4
        out = self.convtr5p8s2(out)
        out = self.bntr5(out)
        out = self.relu(out)
        out = ME.cat(out, out_b2p4)
        out = self.block6(out)
        # tensor_stride=2
        out = self.convtr6p4s2(out)
        out = self.bntr6(out)
        out = self.relu(out)
        out = ME.cat(out, out_b1p2)
        out = self.block7(out)
        # tensor_stride=1
        out = self.convtr7p2s2(out)
        out = self.bntr7(out)
        out = self.relu(out)
        out = ME.cat(out, out_p1)
        out = self.block8(out)
        return self.final(out)
# Named variants: depth (14/18/34/50/101) fixes BLOCK/LAYERS; the letter
# suffix (A/B/C/D/E) selects a channel-width (PLANES) configuration.
class MinkUNet14(MinkUNetBase):
    BLOCK = BasicBlock
    LAYERS = (1, 1, 1, 1, 1, 1, 1, 1)


class MinkUNet18(MinkUNetBase):
    BLOCK = BasicBlock
    LAYERS = (2, 2, 2, 2, 2, 2, 2, 2)


class MinkUNet34(MinkUNetBase):
    BLOCK = BasicBlock
    LAYERS = (2, 3, 4, 6, 2, 2, 2, 2)


class MinkUNet50(MinkUNetBase):
    # Deeper variants switch to bottleneck residual blocks.
    BLOCK = Bottleneck
    LAYERS = (2, 3, 4, 6, 2, 2, 2, 2)


class MinkUNet101(MinkUNetBase):
    BLOCK = Bottleneck
    LAYERS = (2, 3, 4, 23, 2, 2, 2, 2)


class MinkUNet14A(MinkUNet14):
    PLANES = (32, 64, 128, 256, 128, 128, 96, 96)


class MinkUNet14B(MinkUNet14):
    PLANES = (32, 64, 128, 256, 128, 128, 128, 128)


class MinkUNet14C(MinkUNet14):
    PLANES = (32, 64, 128, 256, 192, 192, 128, 128)


# NOTE(review): MinkUNet14Dori and MinkUNet14E use identical PLANES;
# MinkUNet14D below differs — confirm which checkpoints expect which.
class MinkUNet14Dori(MinkUNet14):
    PLANES = (32, 64, 128, 256, 384, 384, 384, 384)


class MinkUNet14E(MinkUNet14):
    PLANES = (32, 64, 128, 256, 384, 384, 384, 384)


class MinkUNet14D(MinkUNet14):
    PLANES = (32, 64, 128, 256, 192, 192, 192, 192)


class MinkUNet18A(MinkUNet18):
    PLANES = (32, 64, 128, 256, 128, 128, 96, 96)


class MinkUNet18B(MinkUNet18):
    PLANES = (32, 64, 128, 256, 128, 128, 128, 128)


class MinkUNet18D(MinkUNet18):
    PLANES = (32, 64, 128, 256, 384, 384, 384, 384)


class MinkUNet34A(MinkUNet34):
    PLANES = (32, 64, 128, 256, 256, 128, 64, 64)


class MinkUNet34B(MinkUNet34):
    PLANES = (32, 64, 128, 256, 256, 128, 64, 32)


class MinkUNet34C(MinkUNet34):
    PLANES = (32, 64, 128, 256, 256, 128, 96, 96)

View File

@@ -0,0 +1,128 @@
""" GraspNet baseline model definition.
Author: chenxi-wang
"""
import logging
# import os
# import sys
import numpy as np
import torch
import torch.nn as nn
import MinkowskiEngine as ME
from .backbone_resunet14 import MinkUNet14D
from .modules import ApproachNet, GraspableNet, CloudCrop, SWADNet
from ..utils.loss_utils import GRASP_MAX_WIDTH, NUM_VIEW, NUM_ANGLE, NUM_DEPTH, GRASPNESS_THRESHOLD, M_POINT
from ..utils.label_generation import (process_grasp_labels, match_grasp_view_and_label, batch_viewpoint_params_to_matrix)
from ...pointnet2.pointnet2_utils import furthest_point_sample, gather_operation
class GraspNet(nn.Module):
    """End-to-end grasp pose prediction network.

    Pipeline: sparse MinkUNet backbone -> per-point graspable classification
    -> farthest-point sampling of graspable seeds -> approach-view estimation
    -> cylinder grouping around each seed -> score/width per (angle, depth)
    bin (SWADNet).
    """

    def __init__(self, cylinder_radius=0.05, seed_feat_dim=512, is_training=True):
        super().__init__()
        self.is_training = is_training
        self.seed_feature_dim = seed_feat_dim
        self.num_depth = NUM_DEPTH
        self.num_angle = NUM_ANGLE
        self.M_points = M_POINT
        self.num_view = NUM_VIEW
        self.backbone = MinkUNet14D(in_channels=3, out_channels=self.seed_feature_dim, D=3)
        self.graspable = GraspableNet(seed_feature_dim=self.seed_feature_dim)
        self.rotation = ApproachNet(self.num_view, seed_feature_dim=self.seed_feature_dim, is_training=self.is_training)
        self.crop = CloudCrop(nsample=16, cylinder_radius=cylinder_radius, seed_feature_dim=self.seed_feature_dim)
        self.swad = SWADNet(num_angle=self.num_angle, num_depth=self.num_depth)

    def forward(self, end_points):
        """Run the full pipeline; mutates and returns ``end_points``.

        Returns None when a batch element has no graspable points at all
        (callers must handle this case).
        """
        seed_xyz = end_points['point_clouds']  # use all sampled point cloud, B*Ns*3
        B, point_num, _ = seed_xyz.shape  # batch _size
        # point-wise features
        coordinates_batch = end_points['coors']
        features_batch = end_points['feats']
        mink_input = ME.SparseTensor(features_batch, coordinates=coordinates_batch)
        seed_features = self.backbone(mink_input).F
        # Map quantized voxels back to the original points, then to B*feat*Ns.
        seed_features = seed_features[end_points['quantize2original']].view(B, point_num, -1).transpose(1, 2)
        end_points = self.graspable(seed_features, end_points)
        seed_features_flipped = seed_features.transpose(1, 2)  # B*Ns*feat_dim
        objectness_score = end_points['objectness_score']
        graspness_score = end_points['graspness_score'].squeeze(1)
        objectness_pred = torch.argmax(objectness_score, 1)
        objectness_mask = (objectness_pred == 1)
        graspness_mask = graspness_score > GRASPNESS_THRESHOLD
        # A point is a grasp candidate when it is on an object AND graspable.
        graspable_mask = objectness_mask & graspness_mask
        seed_features_graspable = []
        seed_xyz_graspable = []
        graspable_num_batch = 0.
        for i in range(B):
            cur_mask = graspable_mask[i]
            graspable_num_batch += cur_mask.sum()
            cur_feat = seed_features_flipped[i][cur_mask]  # Ns*feat_dim
            cur_seed_xyz = seed_xyz[i][cur_mask]  # Ns*3 Ns=0->error
            # Bug fix: these per-element diagnostics were emitted with
            # logging.error (wrong severity, eager f-string formatting);
            # demote to DEBUG with lazy %-args.
            logging.debug("cur_mask: %s", cur_mask.shape)
            logging.debug("true num: %s", torch.sum(cur_mask).item())
            logging.debug("cur_seed_xyz: %s", cur_seed_xyz.shape)
            if torch.sum(cur_mask).item() == 0:
                # No graspable points in this sample; abort the forward pass.
                return None
            cur_seed_xyz = cur_seed_xyz.unsqueeze(0)  # 1*Ns*3
            # Downsample candidates to a fixed M_points via FPS.
            fps_idxs = furthest_point_sample(cur_seed_xyz, self.M_points)
            cur_seed_xyz_flipped = cur_seed_xyz.transpose(1, 2).contiguous()  # 1*3*Ns
            cur_seed_xyz = gather_operation(cur_seed_xyz_flipped, fps_idxs).transpose(1, 2).squeeze(0).contiguous()  # Ns*3
            cur_feat_flipped = cur_feat.unsqueeze(0).transpose(1, 2).contiguous()  # 1*feat_dim*Ns
            cur_feat = gather_operation(cur_feat_flipped, fps_idxs).squeeze(0).contiguous()  # feat_dim*Ns
            seed_features_graspable.append(cur_feat)
            seed_xyz_graspable.append(cur_seed_xyz)
        seed_xyz_graspable = torch.stack(seed_xyz_graspable, 0)  # B*Ns*3
        seed_features_graspable = torch.stack(seed_features_graspable)  # B*feat_dim*Ns
        end_points['xyz_graspable'] = seed_xyz_graspable
        end_points['graspable_count_stage1'] = graspable_num_batch / B
        end_points, res_feat = self.rotation(seed_features_graspable, end_points)
        # Residual feature from the approach-view head.
        seed_features_graspable = seed_features_graspable + res_feat
        if self.is_training:
            end_points = process_grasp_labels(end_points)
            grasp_top_views_rot, end_points = match_grasp_view_and_label(end_points)
        else:
            grasp_top_views_rot = end_points['grasp_top_view_rot']
        group_features = self.crop(seed_xyz_graspable.contiguous(), seed_features_graspable.contiguous(), grasp_top_views_rot)
        end_points = self.swad(group_features, end_points)
        return end_points
def pred_decode(end_points):
    """Decode raw network outputs into per-seed 17-dim grasp predictions.

    For each of the M_POINT seeds, picks the (angle, depth) bin with the
    highest predicted score and assembles
    [score, width, height, depth, rotation(9), translation(3), object_id].
    Returns a list with one (M_POINT, 17) tensor per batch element.
    """
    batch_size = len(end_points['point_clouds'])
    grasp_preds = []
    for i in range(batch_size):
        grasp_center = end_points['xyz_graspable'][i].float()
        grasp_score = end_points['grasp_score_pred'][i].float()
        grasp_score = grasp_score.view(M_POINT, NUM_ANGLE*NUM_DEPTH)
        # Best (angle, depth) bin per seed point.
        grasp_score, grasp_score_inds = torch.max(grasp_score, -1)  # [M_POINT]
        grasp_score = grasp_score.view(-1, 1)
        # Bin index encodes angle (major) and depth (minor).
        # NOTE(review): the pi/12 step assumes 12 angle bins — confirm NUM_ANGLE.
        grasp_angle = (grasp_score_inds // NUM_DEPTH) * np.pi / 12
        # Depth bins are 1 cm apart, starting at 1 cm.
        grasp_depth = (grasp_score_inds % NUM_DEPTH + 1) * 0.01
        grasp_depth = grasp_depth.view(-1, 1)
        # NOTE(review): widths appear to be predicted at 10x scale with a
        # 1.2 safety margin applied here — confirm against training code.
        grasp_width = 1.2 * end_points['grasp_width_pred'][i] / 10.
        grasp_width = grasp_width.view(M_POINT, NUM_ANGLE*NUM_DEPTH)
        grasp_width = torch.gather(grasp_width, 1, grasp_score_inds.view(-1, 1))
        grasp_width = torch.clamp(grasp_width, min=0., max=GRASP_MAX_WIDTH)
        # Approach vector points from the view toward the object.
        approaching = -end_points['grasp_top_view_xyz'][i].float()
        grasp_rot = batch_viewpoint_params_to_matrix(approaching, grasp_angle)
        grasp_rot = grasp_rot.view(M_POINT, 9)
        # merge preds
        grasp_height = 0.02 * torch.ones_like(grasp_score)
        # Object id is not predicted; fill with -1.
        obj_ids = -1 * torch.ones_like(grasp_score)
        grasp_preds.append(
            torch.cat([grasp_score, grasp_width, grasp_height, grasp_depth, grasp_rot, grasp_center, obj_ids], axis=-1))
    return grasp_preds

View File

@@ -0,0 +1,80 @@
import torch.nn as nn
import torch
def get_loss(end_points):
    """Aggregate the five training losses into one weighted objective.

    Weights: objectness x1, graspness x10, view x100, score x15, width x10.
    Stores the total under 'loss/overall_loss' and returns (loss, end_points).
    """
    obj_loss, end_points = compute_objectness_loss(end_points)
    graspness_loss_, end_points = compute_graspness_loss(end_points)
    view_loss_, end_points = compute_view_graspness_loss(end_points)
    score_loss_, end_points = compute_score_loss(end_points)
    width_loss_, end_points = compute_width_loss(end_points)
    total = obj_loss + 10 * graspness_loss_ + 100 * view_loss_ + 15 * score_loss_ + 10 * width_loss_
    end_points['loss/overall_loss'] = total
    return total, end_points
def compute_objectness_loss(end_points):
    """Two-class objectness cross-entropy plus accuracy/precision/recall metrics."""
    score = end_points['objectness_score']  # (B, 2, num_seed) logits
    label = end_points['objectness_label']  # (B, num_seed)
    loss = nn.CrossEntropyLoss(reduction='mean')(score, label)
    end_points['loss/stage1_objectness_loss'] = loss

    pred = torch.argmax(score, 1)
    correct = (pred == label.long()).float()
    end_points['stage1_objectness_acc'] = correct.mean()
    # precision: correctness over seeds predicted positive
    end_points['stage1_objectness_prec'] = correct[pred == 1].mean()
    # recall: correctness over seeds labelled positive
    end_points['stage1_objectness_recall'] = correct[label == 1].mean()
    return loss, end_points
def compute_graspness_loss(end_points):
    """Point-wise graspness regression (SmoothL1), restricted to object points."""
    score = end_points['graspness_score'].squeeze(1)
    label = end_points['graspness_label'].squeeze(-1)
    mask = end_points['objectness_label'].bool()
    per_point = nn.SmoothL1Loss(reduction='none')(score, label)
    loss = per_point[mask].mean()
    # rank error: disagreement after bucketing graspness into 20 bins of 0.05
    score_c = torch.clamp(score.detach().clone()[mask], 0., 0.99)
    label_c = torch.clamp(label.detach().clone()[mask], 0., 0.99)
    rank_error = (torch.abs(torch.trunc(score_c * 20) - torch.trunc(label_c * 20)) / 20.).mean()
    end_points['stage1_graspness_acc_rank_error'] = rank_error
    end_points['loss/stage1_graspness_loss'] = loss
    return loss, end_points
def compute_view_graspness_loss(end_points):
    """SmoothL1 loss between predicted and labelled per-view graspness."""
    predicted = end_points['view_score']
    target = end_points['batch_grasp_view_graspness']
    loss = nn.SmoothL1Loss(reduction='mean')(predicted, target)
    end_points['loss/stage2_view_loss'] = loss
    return loss, end_points
def compute_score_loss(end_points):
    """SmoothL1 loss between predicted and labelled grasp scores."""
    predicted = end_points['grasp_score_pred']
    target = end_points['batch_grasp_score']
    loss = nn.SmoothL1Loss(reduction='mean')(predicted, target)
    end_points['loss/stage3_score_loss'] = loss
    return loss, end_points
def compute_width_loss(end_points):
    """SmoothL1 width regression, supervised only where the score label is positive.

    Labels are scaled by 10 to match the network's regression range.
    Returns a zero loss (instead of NaN) when the batch has no positive label.
    """
    criterion = nn.SmoothL1Loss(reduction='none')
    grasp_width_pred = end_points['grasp_width_pred']
    grasp_width_label = end_points['batch_grasp_width'] * 10
    loss = criterion(grasp_width_pred, grasp_width_label)
    loss_mask = end_points['batch_grasp_score'] > 0
    if loss_mask.any():
        loss = loss[loss_mask].mean()
    else:
        # no positive grasp anywhere in the batch: mean over an empty
        # selection would be NaN and poison the overall objective
        loss = loss.sum() * 0.
    end_points['loss/stage3_width_loss'] = loss
    return loss, end_points

View File

@@ -0,0 +1,116 @@
# import os
# import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# ROOT_DIR = os.path.dirname(BASE_DIR)
# sys.path.append(ROOT_DIR)
from ...pointnet2 import pytorch_utils as pt_utils
from ...pointnet2.pointnet2_utils import CylinderQueryAndGroup
from ..utils.loss_utils import generate_grasp_views, batch_viewpoint_params_to_matrix
class GraspableNet(nn.Module):
    """Predicts per-seed objectness logits and a graspness score from seed features."""

    def __init__(self, seed_feature_dim):
        super().__init__()
        self.in_dim = seed_feature_dim  # default: 512
        # one 1x1 conv emits 3 channels: 2 objectness logits + 1 graspness score
        self.conv_graspable = nn.Conv1d(self.in_dim, 3, 1)

    def forward(self, seed_features, end_points):
        graspable = self.conv_graspable(seed_features)      # (B, 3, num_seed)
        end_points['objectness_score'] = graspable[:, :2]   # (B, 2, num_seed)
        end_points['graspness_score'] = graspable[:, 2]     # (B, num_seed)
        return end_points
class ApproachNet(nn.Module):
    """Scores candidate approach views per seed and selects one per seed.

    During training the view is sampled from the (normalized) score
    distribution for exploration; at inference the argmax view is used.
    Outputs the chosen view direction, its rotation matrix and its index.
    """
    def __init__(self, num_view, seed_feature_dim, is_training=True):
        # num_view: number of candidate approach views per seed point
        # seed_feature_dim: channel width of the incoming seed features
        super().__init__()
        self.num_view = num_view
        self.in_dim = seed_feature_dim
        self.is_training = is_training
        self.conv1 = nn.Conv1d(self.in_dim, self.in_dim, 1)
        self.conv2 = nn.Conv1d(self.in_dim, self.num_view, 1)
    def forward(self, seed_features, end_points):
        """Returns updated end_points and the intermediate residual features.

        seed_features: (B, in_dim, num_seed) per-seed feature map.
        """
        B, _, num_seed = seed_features.size()
        res_features = F.relu(self.conv1(seed_features), inplace=True)
        features = self.conv2(res_features)
        view_score = features.transpose(1, 2).contiguous()  # (B, num_seed, num_view)
        end_points['view_score'] = view_score
        if self.is_training:
            # normalize view graspness score to 0~1 so it can be used as
            # (unnormalized) multinomial sampling weights per seed
            view_score_ = view_score.clone().detach()
            view_score_max, _ = torch.max(view_score_, dim=2)
            view_score_min, _ = torch.min(view_score_, dim=2)
            view_score_max = view_score_max.unsqueeze(-1).expand(-1, -1, self.num_view)
            view_score_min = view_score_min.unsqueeze(-1).expand(-1, -1, self.num_view)
            view_score_ = (view_score_ - view_score_min) / (view_score_max - view_score_min + 1e-8)
            # sample one view per seed, independently for each batch element
            top_view_inds = []
            for i in range(B):
                top_view_inds_batch = torch.multinomial(view_score_[i], 1, replacement=False)
                top_view_inds.append(top_view_inds_batch)
            top_view_inds = torch.stack(top_view_inds, dim=0).squeeze(-1)  # B, num_seed
        else:
            # inference: deterministically take the best-scoring view
            _, top_view_inds = torch.max(view_score, dim=2)  # (B, num_seed)
        # gather the xyz direction of each chosen view from the view template
        top_view_inds_ = top_view_inds.view(B, num_seed, 1, 1).expand(-1, -1, -1, 3).contiguous()
        template_views = generate_grasp_views(self.num_view).to(features.device)  # (num_view, 3)
        template_views = template_views.view(1, 1, self.num_view, 3).expand(B, num_seed, -1, -1).contiguous()
        vp_xyz = torch.gather(template_views, 2, top_view_inds_).squeeze(2)  # (B, num_seed, 3)
        vp_xyz_ = vp_xyz.view(-1, 3)
        # in-plane angle is zero here; only the approach direction is fixed
        batch_angle = torch.zeros(vp_xyz_.size(0), dtype=vp_xyz.dtype, device=vp_xyz.device)
        vp_rot = batch_viewpoint_params_to_matrix(-vp_xyz_, batch_angle).view(B, num_seed, 3, 3)
        end_points['grasp_top_view_xyz'] = vp_xyz
        end_points['grasp_top_view_rot'] = vp_rot
        end_points['grasp_top_view_inds'] = top_view_inds
        return end_points, res_features
class CloudCrop(nn.Module):
    """Crops a cylindrical neighborhood around each graspable seed, oriented by
    the predicted approach rotation, and encodes it into one feature per seed."""
    def __init__(self, nsample, seed_feature_dim, cylinder_radius=0.05, hmin=-0.02, hmax=0.04):
        # nsample: number of points grouped inside each cylinder
        # seed_feature_dim: channel width of incoming per-point features
        # cylinder_radius / hmin / hmax: cylinder geometry (presumably meters
        # — consistent with other metric constants in this project; confirm)
        super().__init__()
        self.nsample = nsample
        self.in_dim = seed_feature_dim
        self.cylinder_radius = cylinder_radius
        mlps = [3 + self.in_dim, 256, 256]  # use xyz, so plus 3
        self.grouper = CylinderQueryAndGroup(radius=cylinder_radius, hmin=hmin, hmax=hmax, nsample=nsample,
                                             use_xyz=True, normalize_xyz=True)
        self.mlps = pt_utils.SharedMLP(mlps, bn=True)
    def forward(self, seed_xyz_graspable, seed_features_graspable, vp_rot):
        """Group, encode, and max-pool each cylinder into a per-seed feature.

        Returns: (batch_size, 256, M) features, one column per seed.
        """
        grouped_feature = self.grouper(seed_xyz_graspable, seed_xyz_graspable, vp_rot,
                                       seed_features_graspable)  # B*3 + feat_dim*M*K
        new_features = self.mlps(grouped_feature)  # (batch_size, mlps[-1], M, K)
        # pool across the K grouped points of each cylinder
        new_features = F.max_pool2d(new_features, kernel_size=[1, new_features.size(3)])  # (batch_size, mlps[-1], M, 1)
        new_features = new_features.squeeze(-1)  # (batch_size, mlps[-1], M)
        return new_features
class SWADNet(nn.Module):
    """Predicts grasp Score and Width for every (Angle, Depth) bin per seed."""

    def __init__(self, num_angle, num_depth):
        super().__init__()
        self.num_angle = num_angle
        self.num_depth = num_depth
        self.conv1 = nn.Conv1d(256, 256, 1)  # input feat dim need to be consistent with CloudCrop module
        self.conv_swad = nn.Conv1d(256, 2*num_angle*num_depth, 1)

    def forward(self, vp_features, end_points):
        batch, _, num_seed = vp_features.size()
        hidden = F.relu(self.conv1(vp_features), inplace=True)
        swad = self.conv_swad(hidden)
        # (B, 2*A*D, Ns) -> (B, 2, A, D, Ns) -> (B, 2, Ns, A, D)
        swad = swad.view(batch, 2, self.num_angle, self.num_depth, num_seed)
        swad = swad.permute(0, 1, 4, 2, 3)
        # split prediction: channel 0 -> score, channel 1 -> width
        end_points['grasp_score_pred'] = swad[:, 0]  # B * num_seed * num angle * num_depth
        end_points['grasp_width_pred'] = swad[:, 1]
        return end_points

View File

@@ -0,0 +1,196 @@
import torch.nn as nn
try:
import open3d as o3d
except ImportError:
raise ImportError("Please install open3d with `pip install open3d`.")
import MinkowskiEngine as ME
from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck
class ResNetBase(nn.Module):
    """Base class for sparse-convolutional ResNets built on MinkowskiEngine.

    Subclasses configure the architecture through the class attributes:
    BLOCK (residual block type), LAYERS (blocks per stage), INIT_DIM
    (stem width) and PLANES (per-stage widths).
    """
    BLOCK = None
    LAYERS = ()
    INIT_DIM = 64
    PLANES = (64, 128, 256, 512)
    def __init__(self, in_channels, out_channels, D=3):
        # D: spatial dimension of the sparse tensors (3 for point clouds)
        nn.Module.__init__(self)
        self.D = D
        assert self.BLOCK is not None
        self.network_initialization(in_channels, out_channels, D)
        self.weight_initialization()
    def network_initialization(self, in_channels, out_channels, D):
        """Builds stem, four residual stages, head conv, pooling and classifier."""
        self.inplanes = self.INIT_DIM
        # stem: strided conv + instance norm + ReLU + max-pool
        self.conv1 = nn.Sequential(
            ME.MinkowskiConvolution(
                in_channels, self.inplanes, kernel_size=3, stride=2, dimension=D
            ),
            ME.MinkowskiInstanceNorm(self.inplanes),
            ME.MinkowskiReLU(inplace=True),
            ME.MinkowskiMaxPooling(kernel_size=2, stride=2, dimension=D),
        )
        # four residual stages, each downsampling by stride 2
        self.layer1 = self._make_layer(
            self.BLOCK, self.PLANES[0], self.LAYERS[0], stride=2
        )
        self.layer2 = self._make_layer(
            self.BLOCK, self.PLANES[1], self.LAYERS[1], stride=2
        )
        self.layer3 = self._make_layer(
            self.BLOCK, self.PLANES[2], self.LAYERS[2], stride=2
        )
        self.layer4 = self._make_layer(
            self.BLOCK, self.PLANES[3], self.LAYERS[3], stride=2
        )
        # head conv with dropout before global pooling
        self.conv5 = nn.Sequential(
            ME.MinkowskiDropout(),
            ME.MinkowskiConvolution(
                self.inplanes, self.inplanes, kernel_size=3, stride=3, dimension=D
            ),
            ME.MinkowskiInstanceNorm(self.inplanes),
            ME.MinkowskiGELU(),
        )
        self.glob_pool = ME.MinkowskiGlobalMaxPooling()
        self.final = ME.MinkowskiLinear(self.inplanes, out_channels, bias=True)
    def weight_initialization(self):
        """Kaiming init for conv kernels; unit/zero init for batch norms."""
        for m in self.modules():
            if isinstance(m, ME.MinkowskiConvolution):
                ME.utils.kaiming_normal_(m.kernel, mode="fan_out", nonlinearity="relu")
            if isinstance(m, ME.MinkowskiBatchNorm):
                nn.init.constant_(m.bn.weight, 1)
                nn.init.constant_(m.bn.bias, 0)
    def _make_layer(self, block, planes, blocks, stride=1, dilation=1, bn_momentum=0.1):
        """Builds one residual stage of `blocks` blocks.

        A 1x1 strided conv downsample branch is added when the spatial
        stride or the channel count changes.
        """
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                ME.MinkowskiConvolution(
                    self.inplanes,
                    planes * block.expansion,
                    kernel_size=1,
                    stride=stride,
                    dimension=self.D,
                ),
                ME.MinkowskiBatchNorm(planes * block.expansion),
            )
        layers = []
        layers.append(
            block(
                self.inplanes,
                planes,
                stride=stride,
                dilation=dilation,
                downsample=downsample,
                dimension=self.D,
            )
        )
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(
                block(
                    self.inplanes, planes, stride=1, dilation=dilation, dimension=self.D
                )
            )
        return nn.Sequential(*layers)
    def forward(self, x: ME.SparseTensor):
        """Stem -> 4 stages -> head conv -> global max-pool -> linear logits."""
        x = self.conv1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.conv5(x)
        x = self.glob_pool(x)
        return self.final(x)
class ResNet14(ResNetBase):
    """Shallowest variant: one BasicBlock per stage."""
    BLOCK = BasicBlock
    LAYERS = (1, 1, 1, 1)
class ResNet18(ResNetBase):
    """ResNet-18 layout: two BasicBlocks per stage."""
    BLOCK = BasicBlock
    LAYERS = (2, 2, 2, 2)
class ResNet34(ResNetBase):
    """ResNet-34 layout: (3, 4, 6, 3) BasicBlocks."""
    BLOCK = BasicBlock
    LAYERS = (3, 4, 6, 3)
class ResNet50(ResNetBase):
    """ResNet-50 layout: (3, 4, 6, 3) Bottleneck blocks."""
    BLOCK = Bottleneck
    LAYERS = (3, 4, 6, 3)
class ResNet101(ResNetBase):
    """ResNet-101 layout: (3, 4, 23, 3) Bottleneck blocks."""
    BLOCK = Bottleneck
    LAYERS = (3, 4, 23, 3)
class ResFieldNetBase(ResNetBase):
    """ResNet variant operating on ME.TensorField inputs.

    Two sinusoidal embedding stages lift the raw field features before the
    standard ResNet trunk runs on the resulting sparse tensor.
    """
    def network_initialization(self, in_channels, out_channels, D):
        field_ch = 32
        field_ch2 = 64
        # first field embedding: raw input -> 32 channels, then sparsify
        self.field_network = nn.Sequential(
            ME.MinkowskiSinusoidal(in_channels, field_ch),
            ME.MinkowskiBatchNorm(field_ch),
            ME.MinkowskiReLU(inplace=True),
            ME.MinkowskiLinear(field_ch, field_ch),
            ME.MinkowskiBatchNorm(field_ch),
            ME.MinkowskiReLU(inplace=True),
            ME.MinkowskiToSparseTensor(),
        )
        # second embedding consumes embedding concatenated with the raw input
        self.field_network2 = nn.Sequential(
            ME.MinkowskiSinusoidal(field_ch + in_channels, field_ch2),
            ME.MinkowskiBatchNorm(field_ch2),
            ME.MinkowskiReLU(inplace=True),
            ME.MinkowskiLinear(field_ch2, field_ch2),
            ME.MinkowskiBatchNorm(field_ch2),
            ME.MinkowskiReLU(inplace=True),
            ME.MinkowskiToSparseTensor(),
        )
        # trunk now takes the 64-channel embedding as its input
        ResNetBase.network_initialization(self, field_ch2, out_channels, D)
    def forward(self, x: ME.TensorField):
        """Embed the field twice, then run the standard ResNet trunk."""
        otensor = self.field_network(x)
        otensor2 = self.field_network2(otensor.cat_slice(x))
        return ResNetBase.forward(self, otensor2)
class ResFieldNet14(ResFieldNetBase):
    """Field-input variant with one BasicBlock per stage."""
    BLOCK = BasicBlock
    LAYERS = (1, 1, 1, 1)
class ResFieldNet18(ResFieldNetBase):
    """Field-input variant with the ResNet-18 layout."""
    BLOCK = BasicBlock
    LAYERS = (2, 2, 2, 2)
class ResFieldNet34(ResFieldNetBase):
    """Field-input variant with the ResNet-34 layout."""
    BLOCK = BasicBlock
    LAYERS = (3, 4, 6, 3)
class ResFieldNet50(ResFieldNetBase):
    """Field-input variant with the ResNet-50 layout."""
    BLOCK = Bottleneck
    LAYERS = (3, 4, 6, 3)
class ResFieldNet101(ResFieldNetBase):
    """Field-input variant with the ResNet-101 layout."""
    BLOCK = Bottleneck
    LAYERS = (3, 4, 23, 3)

View File

@@ -0,0 +1,132 @@
""" Collision detection to remove collided grasp pose predictions.
Author: chenxi-wang
"""
# import os
# import sys
import numpy as np
import open3d as o3d
class ModelFreeCollisionDetector():
    """ Collision detection in scenes without object labels. Current finger width and length are fixed.

    Input:
        scene_points: [numpy.ndarray, (N,3), numpy.float32]
            the scene points to detect
        voxel_size: [float]
            used for downsample

    Example usage:
        mfcdetector = ModelFreeCollisionDetector(scene_points, voxel_size=0.005)
        collision_mask = mfcdetector.detect(grasp_group, approach_dist=0.03)
        collision_mask, iou_list = mfcdetector.detect(grasp_group, approach_dist=0.03, collision_thresh=0.05, return_ious=True)
        collision_mask, empty_mask = mfcdetector.detect(grasp_group, approach_dist=0.03, collision_thresh=0.05,
                            return_empty_grasp=True, empty_thresh=0.01)
        collision_mask, empty_mask, iou_list = mfcdetector.detect(grasp_group, approach_dist=0.03, collision_thresh=0.05,
                            return_empty_grasp=True, empty_thresh=0.01, return_ious=True)
    """
    def __init__(self, scene_points, voxel_size=0.005):
        # fixed gripper finger geometry (presumably meters — matches the
        # metric voxel_size default; confirm against the gripper model)
        self.finger_width = 0.005
        self.finger_length = 0.09
        self.voxel_size = voxel_size
        # voxel-downsample the scene once so per-grasp checks are cheaper
        scene_cloud = o3d.geometry.PointCloud()
        scene_cloud.points = o3d.utility.Vector3dVector(scene_points)
        scene_cloud = scene_cloud.voxel_down_sample(voxel_size)
        self.scene_points = np.array(scene_cloud.points)  # (K, 3)
    def detect(self, preds, approach_dist=0.03, collision_thresh=0.05, return_empty_grasp=False, empty_thresh=0.01, return_ious=False):
        """ Detect collision of grasps.

        Input:
            preds: [numpy.ndarray, (M,17)]
                the grasps to check; columns match the decoder output:
                [score, width, height, depth, rotation(9), center(3), obj_id]
            approach_dist: [float]
                the distance for a gripper to move along approaching direction before grasping;
                this shifting space is also required to contain no scene point
            collision_thresh: [float]
                if global collision iou is greater than this threshold,
                a collision is detected
            return_empty_grasp: [bool]
                if True, return a mask to imply whether there are objects in a grasp
            empty_thresh: [float]
                if inner space iou is smaller than this threshold,
                the grasp is regarded as empty;
                only set when [return_empty_grasp] is True
            return_ious: [bool]
                if True, return global collision iou and part collision ious
        Output:
            collision_mask: [numpy.ndarray, (M,), bool]
                True implies collision
            [optional] empty_mask: [numpy.ndarray, (M,), bool]
                True implies empty grasp
                only returned when [return_empty_grasp] is True
            [optional] iou_list: list of [numpy.ndarray, (M,), numpy.float32]
                global and part collision ious, containing
                [global_iou, left_iou, right_iou, bottom_iou, shifting_iou]
                only returned when [return_ious] is True
        """
        approach_dist = max(approach_dist, self.finger_width)
        T = preds[:, 13:16].reshape(-1, 3)  # (N, 3) grasp centers
        R = preds[:, 4:13].reshape(-1, 3, 3)  # (N, 3, 3) grasp rotations
        heights = preds[:, 2][:,np.newaxis]  # (N,) -> (N, 1)
        depths = preds[:, 3][:,np.newaxis]  # (N,) -> (N, 1)
        widths = preds[:, 1][:,np.newaxis]  # (N,) -> (N, 1)
        # use a different offset for each grasp point:
        # express scene points in every gripper's local frame
        targets = self.scene_points[np.newaxis,:,:] - T[:,np.newaxis,:]  # (1, K, 3) - (N, 1, 3) = (N, K, 3)
        targets = np.matmul(targets, R)  # (N, K, 3) * (N, 3, 3) = (N, K, 3)
        ## collision detection
        # height mask
        mask1 = ((targets[:,:,2] > -heights/2) & (targets[:,:,2] < heights/2))  # (N, K) * (N, 3) = (N, K)
        # left finger mask
        mask2 = ((targets[:,:,0] > depths - self.finger_length) & (targets[:,:,0] < depths))
        mask3 = (targets[:,:,1] > -(widths/2 + self.finger_width))
        mask4 = (targets[:,:,1] < -widths/2)
        # right finger mask
        mask5 = (targets[:,:,1] < (widths/2 + self.finger_width))
        mask6 = (targets[:,:,1] > widths/2)
        # bottom mask
        mask7 = ((targets[:,:,0] <= depths - self.finger_length)\
                & (targets[:,:,0] > depths - self.finger_length - self.finger_width))
        # shifting mask
        mask8 = ((targets[:,:,0] <= depths - self.finger_length - self.finger_width)\
                & (targets[:,:,0] > depths - self.finger_length - self.finger_width - approach_dist))
        # get collision mask of each point
        left_mask = (mask1 & mask2 & mask3 & mask4)
        right_mask = (mask1 & mask2 & mask5 & mask6)
        bottom_mask = (mask1 & mask3 & mask5 & mask7)
        shifting_mask = (mask1 & mask3 & mask5 & mask8)
        global_mask = (left_mask | right_mask | bottom_mask | shifting_mask)
        # calculate equivalant volume of each part (in voxel counts, so the
        # point-count / volume ratio behaves like an occupancy iou)
        left_right_volume = (heights * self.finger_length * self.finger_width / (self.voxel_size**3)).reshape(-1)
        bottom_volume = (heights * (widths+2*self.finger_width) * self.finger_width / (self.voxel_size**3)).reshape(-1)
        shifting_volume = (heights * (widths+2*self.finger_width) * approach_dist / (self.voxel_size**3)).reshape(-1)
        volume = left_right_volume*2 + bottom_volume + shifting_volume
        # get collision iou of each part
        global_iou = global_mask.sum(axis=1) / (volume+1e-6)
        # get collison mask
        collision_mask = (global_iou > collision_thresh)
        if not (return_empty_grasp or return_ious):
            return collision_mask
        ret_value = [collision_mask,]
        if return_empty_grasp:
            # inner region between the fingers: too few points => empty grasp
            inner_mask = (mask1 & mask2 & (~mask4) & (~mask6))
            inner_volume = (heights * self.finger_length * widths / (self.voxel_size**3)).reshape(-1)
            empty_mask = (inner_mask.sum(axis=-1)/inner_volume < empty_thresh)
            ret_value.append(empty_mask)
        if return_ious:
            left_iou = left_mask.sum(axis=1) / (left_right_volume+1e-6)
            right_iou = right_mask.sum(axis=1) / (left_right_volume+1e-6)
            bottom_iou = bottom_mask.sum(axis=1) / (bottom_volume+1e-6)
            shifting_iou = shifting_mask.sum(axis=1) / (shifting_volume+1e-6)
            ret_value.append([global_iou, left_iou, right_iou, bottom_iou, shifting_iou])
        return ret_value

View File

@@ -0,0 +1,156 @@
""" Tools for data processing.
Author: chenxi-wang
"""
import numpy as np
class CameraInfo():
    """Pinhole camera intrinsics used for depth-to-point-cloud projection."""
    def __init__(self, width, height, fx, fy, cx, cy, scale):
        # image size in pixels
        self.width, self.height = width, height
        # focal lengths and principal point
        self.fx, self.fy = fx, fy
        self.cx, self.cy = cx, cy
        # factor converting stored depth values to metric depth
        self.scale = scale
def create_point_cloud_from_depth_image(depth, camera, organized=True):
    """ Back-project a depth image into a 3D point cloud.

    Input:
        depth: [numpy.ndarray, (H,W), numpy.float32]
            depth image
        camera: [CameraInfo]
            camera intrinsics
        organized: bool
            whether to keep the cloud in image shape (H,W,3)
    Output:
        cloud: [numpy.ndarray, (H,W,3)/(H*W,3), numpy.float32]
            generated cloud, (H,W,3) for organized=True, (H*W,3) for organized=False
    """
    assert (depth.shape[0] == camera.height and depth.shape[1] == camera.width)
    # pixel grid -> metric coordinates via the pinhole model
    xs, ys = np.meshgrid(np.arange(camera.width), np.arange(camera.height))
    z = depth / camera.scale
    x = (xs - camera.cx) * z / camera.fx
    y = (ys - camera.cy) * z / camera.fy
    cloud = np.stack([x, y, z], axis=-1)
    return cloud if organized else cloud.reshape([-1, 3])
def transform_point_cloud(cloud, transform, format='4x4'):
    """ Transform points to new coordinates with transformation matrix.

    Input:
        cloud: [np.ndarray, (N,3), np.float32]
            points in original coordinates
        transform: [np.ndarray, (3,3)/(3,4)/(4,4), np.float32]
            transformation matrix, could be rotation only or rotation+translation
        format: [string, '3x3'/'3x4'/'4x4']
            the shape of transformation matrix
            '3x3' --> rotation matrix
            '3x4'/'4x4' --> rotation matrix + translation matrix
    Output:
        cloud_transformed: [np.ndarray, (N,3), np.float32]
            points in new coordinates
    """
    if format not in ('3x3', '3x4', '4x4'):
        raise ValueError('Unknown transformation format, only support \'3x3\' or \'4x4\' or \'3x4\'.')
    if format == '3x3':
        # rotation only
        return cloud.dot(transform.T)
    # append homogeneous coordinate, apply, drop it again
    homo = np.concatenate([cloud, np.ones((cloud.shape[0], 1))], axis=1)
    return homo.dot(transform.T)[:, :3]
def compute_point_dists(A, B):
    """ Compute pair-wise point distances in two matrices.

    Input:
        A: [np.ndarray, (N,3), np.float32]
            point cloud A
        B: [np.ndarray, (M,3), np.float32]
            point cloud B
    Output:
        dists: [np.ndarray, (N,M), np.float32]
            distance matrix
    """
    # broadcast (N,1,3) - (1,M,3) -> (N,M,3), then norm along xyz
    diff = A[:, np.newaxis, :] - B[np.newaxis, :, :]
    return np.linalg.norm(diff, axis=-1)
def remove_invisible_grasp_points(cloud, grasp_points, pose, th=0.01):
    """ Remove invisible part of object model according to scene point cloud.

    Input:
        cloud: [np.ndarray, (N,3), np.float32]
            scene point cloud
        grasp_points: [np.ndarray, (M,3), np.float32]
            grasp point label in object coordinates
        pose: [np.ndarray, (4,4), np.float32]
            transformation matrix from object coordinates to world coordinates
        th: [float]
            a grasp point farther than th from every scene point is removed
    Output:
        visible_mask: [np.ndarray, (M,), bool]
            mask to show the visible part of grasp points
    """
    # move grasp labels into the scene frame, then keep only points that
    # have a scene point within th
    transformed = transform_point_cloud(grasp_points, pose)
    min_dists = compute_point_dists(transformed, cloud).min(axis=1)
    return min_dists < th
def get_workspace_mask(cloud, seg, trans=None, organized=True, outlier=0):
    """ Keep points in workspace as input.

    Input:
        cloud: [np.ndarray, (H,W,3), np.float32]
            scene point cloud
        seg: [np.ndarray, (H,W,), np.uint8]
            segmantation label of scene points
        trans: [np.ndarray, (4,4), np.float32]
            transformation matrix for scene points, default: None.
        organized: [bool]
            whether to keep the cloud in image shape (H,W,3)
        outlier: [float]
            margin added around the foreground bounding box
    Output:
        workspace_mask: [np.ndarray, (H,W)/(H*W,), bool]
            mask to indicate whether scene points are in workspace
    """
    if organized:
        h, w, _ = cloud.shape
        cloud = cloud.reshape([h * w, 3])
        seg = seg.reshape(h * w)
    if trans is not None:
        cloud = transform_point_cloud(cloud, trans)
    # workspace = axis-aligned bounding box of labelled points, padded by outlier
    foreground = cloud[seg > 0]
    lower = foreground.min(axis=0) - outlier
    upper = foreground.max(axis=0) + outlier
    workspace_mask = np.all((cloud > lower) & (cloud < upper), axis=1)
    if organized:
        workspace_mask = workspace_mask.reshape([h, w])
    return workspace_mask

View File

@@ -0,0 +1,143 @@
""" Dynamically generate grasp labels during training.
Author: chenxi-wang
"""
import os
import sys
import torch
# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# ROOT_DIR = os.path.dirname(BASE_DIR)
# sys.path.append(ROOT_DIR)
# sys.path.append(os.path.join(ROOT_DIR, 'knn'))
from ...knn.knn_modules import knn
from ..utils.loss_utils import GRASP_MAX_WIDTH, batch_viewpoint_params_to_matrix, \
transform_point_cloud, generate_grasp_views
def process_grasp_labels(end_points):
    """ Process labels according to scene points and object poses.

    For every seed point: merge the grasp labels of all objects in the scene
    (transformed into scene coordinates), assign each seed to its nearest
    labelled grasp point, and compute a per-seed normalized view-graspness
    target. Results are written back into end_points.
    """
    seed_xyzs = end_points['xyz_graspable']  # (B, M_point, 3)
    batch_size, num_samples, _ = seed_xyzs.size()
    batch_grasp_points = []
    batch_grasp_views_rot = []
    batch_grasp_scores = []
    batch_grasp_widths = []
    for i in range(batch_size):
        seed_xyz = seed_xyzs[i]  # (Ns, 3)
        poses = end_points['object_poses_list'][i]  # [(3, 4),]
        # get merged grasp points for label computation
        grasp_points_merged = []
        grasp_views_rot_merged = []
        grasp_scores_merged = []
        grasp_widths_merged = []
        for obj_idx, pose in enumerate(poses):
            grasp_points = end_points['grasp_points_list'][i][obj_idx]  # (Np, 3)
            grasp_scores = end_points['grasp_scores_list'][i][obj_idx]  # (Np, V, A, D)
            grasp_widths = end_points['grasp_widths_list'][i][obj_idx]  # (Np, V, A, D)
            _, V, A, D = grasp_scores.size()
            num_grasp_points = grasp_points.size(0)
            # generate and transform template grasp views
            grasp_views = generate_grasp_views(V).to(pose.device)  # (V, 3)
            grasp_points_trans = transform_point_cloud(grasp_points, pose, '3x4')
            grasp_views_trans = transform_point_cloud(grasp_views, pose[:3, :3], '3x3')
            # generate and transform template grasp view rotation
            angles = torch.zeros(grasp_views.size(0), dtype=grasp_views.dtype, device=grasp_views.device)
            grasp_views_rot = batch_viewpoint_params_to_matrix(-grasp_views, angles)  # (V, 3, 3)
            grasp_views_rot_trans = torch.matmul(pose[:3, :3], grasp_views_rot)  # (V, 3, 3)
            # assign views: map each rotated view back to its nearest template view
            grasp_views_ = grasp_views.transpose(0, 1).contiguous().unsqueeze(0)
            grasp_views_trans_ = grasp_views_trans.transpose(0, 1).contiguous().unsqueeze(0)
            view_inds = knn(grasp_views_trans_, grasp_views_, k=1).squeeze() - 1  # knn indices are 1-based
            grasp_views_rot_trans = torch.index_select(grasp_views_rot_trans, 0, view_inds)  # (V, 3, 3)
            grasp_views_rot_trans = grasp_views_rot_trans.unsqueeze(0).expand(num_grasp_points, -1, -1,
                                                                              -1)  # (Np, V, 3, 3)
            grasp_scores = torch.index_select(grasp_scores, 1, view_inds)  # (Np, V, A, D)
            grasp_widths = torch.index_select(grasp_widths, 1, view_inds)  # (Np, V, A, D)
            # add to list
            grasp_points_merged.append(grasp_points_trans)
            grasp_views_rot_merged.append(grasp_views_rot_trans)
            grasp_scores_merged.append(grasp_scores)
            grasp_widths_merged.append(grasp_widths)
        grasp_points_merged = torch.cat(grasp_points_merged, dim=0)  # (Np', 3)
        grasp_views_rot_merged = torch.cat(grasp_views_rot_merged, dim=0)  # (Np', V, 3, 3)
        grasp_scores_merged = torch.cat(grasp_scores_merged, dim=0)  # (Np', V, A, D)
        grasp_widths_merged = torch.cat(grasp_widths_merged, dim=0)  # (Np', V, A, D)
        # compute nearest neighbors
        seed_xyz_ = seed_xyz.transpose(0, 1).contiguous().unsqueeze(0)  # (1, 3, Ns)
        grasp_points_merged_ = grasp_points_merged.transpose(0, 1).contiguous().unsqueeze(0)  # (1, 3, Np')
        nn_inds = knn(grasp_points_merged_, seed_xyz_, k=1).squeeze() - 1  # (Ns)
        # assign anchor points to real points
        grasp_points_merged = torch.index_select(grasp_points_merged, 0, nn_inds)  # (Ns, 3)
        grasp_views_rot_merged = torch.index_select(grasp_views_rot_merged, 0, nn_inds)  # (Ns, V, 3, 3)
        grasp_scores_merged = torch.index_select(grasp_scores_merged, 0, nn_inds)  # (Ns, V, A, D)
        grasp_widths_merged = torch.index_select(grasp_widths_merged, 0, nn_inds)  # (Ns, V, A, D)
        # add to batch
        batch_grasp_points.append(grasp_points_merged)
        batch_grasp_views_rot.append(grasp_views_rot_merged)
        batch_grasp_scores.append(grasp_scores_merged)
        batch_grasp_widths.append(grasp_widths_merged)
    batch_grasp_points = torch.stack(batch_grasp_points, 0)  # (B, Ns, 3)
    batch_grasp_views_rot = torch.stack(batch_grasp_views_rot, 0)  # (B, Ns, V, 3, 3)
    batch_grasp_scores = torch.stack(batch_grasp_scores, 0)  # (B, Ns, V, A, D)
    batch_grasp_widths = torch.stack(batch_grasp_widths, 0)  # (B, Ns, V, A, D)
    # compute view graspness: fraction of "good" (A, D) bins per view
    view_u_threshold = 0.6
    view_grasp_num = 48
    batch_grasp_view_valid_mask = (batch_grasp_scores <= view_u_threshold) & (batch_grasp_scores > 0)  # (B, Ns, V, A, D)
    batch_grasp_view_valid = batch_grasp_view_valid_mask.float()
    batch_grasp_view_graspness = torch.sum(torch.sum(batch_grasp_view_valid, dim=-1), dim=-1) / view_grasp_num  # (B, Ns, V)
    view_graspness_min, _ = torch.min(batch_grasp_view_graspness, dim=-1)  # (B, Ns)
    view_graspness_max, _ = torch.max(batch_grasp_view_graspness, dim=-1)
    # normalize per seed to [0, 1]; use the actual view count instead of a
    # hard-coded 300 so label sets with a different V keep working
    num_views = batch_grasp_view_graspness.size(-1)
    view_graspness_max = view_graspness_max.unsqueeze(-1).expand(-1, -1, num_views)  # (B, Ns, V)
    view_graspness_min = view_graspness_min.unsqueeze(-1).expand(-1, -1, num_views)
    batch_grasp_view_graspness = (batch_grasp_view_graspness - view_graspness_min) / (view_graspness_max - view_graspness_min + 1e-5)
    # process scores: zero out invalid labels (non-positive score or too wide)
    label_mask = (batch_grasp_scores > 0) & (batch_grasp_widths <= GRASP_MAX_WIDTH)  # (B, Ns, V, A, D)
    batch_grasp_scores[~label_mask] = 0
    end_points['batch_grasp_point'] = batch_grasp_points
    end_points['batch_grasp_view_rot'] = batch_grasp_views_rot
    end_points['batch_grasp_score'] = batch_grasp_scores
    end_points['batch_grasp_width'] = batch_grasp_widths
    end_points['batch_grasp_view_graspness'] = batch_grasp_view_graspness
    return end_points
def match_grasp_view_and_label(end_points):
    """ Slice grasp labels according to predicted views. """
    top_view_inds = end_points['grasp_top_view_inds']  # (B, Ns)
    template_views_rot = end_points['batch_grasp_view_rot']  # (B, Ns, V, 3, 3)
    grasp_scores = end_points['batch_grasp_score']  # (B, Ns, V, A, D)
    grasp_widths = end_points['batch_grasp_width']  # (B, Ns, V, A, D)
    B, Ns, V, A, D = grasp_scores.size()

    # gather rotation / score / width entries of the chosen view per seed
    inds_rot = top_view_inds.view(B, Ns, 1, 1, 1).expand(-1, -1, -1, 3, 3)
    top_template_views_rot = torch.gather(template_views_rot, 2, inds_rot).squeeze(2)
    inds_ad = top_view_inds.view(B, Ns, 1, 1, 1).expand(-1, -1, -1, A, D)
    top_view_grasp_scores = torch.gather(grasp_scores, 2, inds_ad).squeeze(2)
    top_view_grasp_widths = torch.gather(grasp_widths, 2, inds_ad).squeeze(2)

    # rescale positive scores into [0, 1] via a log-ratio against the
    # batch max/min (best score maps to ~1, worst positive to ~0)
    po_mask = top_view_grasp_scores > 0
    if torch.sum(po_mask) > 0:
        u_max = top_view_grasp_scores.max()
        u_min = top_view_grasp_scores[po_mask].min()
        top_view_grasp_scores[po_mask] = torch.log(u_max / top_view_grasp_scores[po_mask]) / (torch.log(u_max / u_min) + 1e-6)
    end_points['batch_grasp_score'] = top_view_grasp_scores  # (B, Ns, A, D)
    end_points['batch_grasp_width'] = top_view_grasp_widths  # (B, Ns, A, D)
    return top_template_views_rot, end_points

View File

@@ -0,0 +1,121 @@
""" Tools for loss computation.
Author: chenxi-wang
"""
import torch
import numpy as np
GRASP_MAX_WIDTH = 0.1  # maximum valid gripper opening width (labels above are discarded)
GRASPNESS_THRESHOLD = 0.1  # graspness score threshold
NUM_VIEW = 300  # number of approach views sampled on the unit sphere
NUM_ANGLE = 12  # number of in-plane rotation bins
NUM_DEPTH = 4  # number of gripper depth bins
M_POINT = 1024  # number of graspable candidate points per scene
def transform_point_cloud(cloud, transform, format='4x4'):
    """ Transform points to new coordinates with transformation matrix.

    Input:
        cloud: [torch.FloatTensor, (N,3)]
            points in original coordinates
        transform: [torch.FloatTensor, (3,3)/(3,4)/(4,4)]
            transformation matrix, could be rotation only or rotation+translation
        format: [string, '3x3'/'3x4'/'4x4']
            the shape of transformation matrix
            '3x3' --> rotation matrix
            '3x4'/'4x4' --> rotation matrix + translation matrix
    Output:
        cloud_transformed: [torch.FloatTensor, (N,3)]
            points in new coordinates
    """
    if format not in ('3x3', '3x4', '4x4'):
        raise ValueError('Unknown transformation format, only support \'3x3\' or \'4x4\' or \'3x4\'.')
    if format == '3x3':
        # rotation only
        return torch.matmul(cloud, transform.T)
    # append homogeneous coordinate (same dtype/device as cloud), apply, drop it
    ones = cloud.new_ones(cloud.size(0), 1)
    homogeneous = torch.cat([cloud, ones], dim=1)
    return torch.matmul(homogeneous, transform.T)[:, :3]
def generate_grasp_views(N=300, phi=(np.sqrt(5) - 1) / 2, center=np.zeros(3), r=1):
    """ View sampling on a unit sphere using Fibonacci lattices.
    Ref: https://arxiv.org/abs/0912.4540

    Input:
        N: [int]
            number of sampled views
        phi: [float]
            constant for view coordinate calculation, different phi's bring different distributions, default: (sqrt(5)-1)/2
        center: [np.ndarray, (3,), np.float32]
            sphere center
        r: [float]
            sphere radius
    Output:
        views: [torch.FloatTensor, (N,3)]
            sampled view coordinates
    """
    # vectorized Fibonacci lattice: evenly spaced z, golden-ratio azimuths
    idx = np.arange(N)
    z = (2 * idx + 1) / N - 1
    ring = np.sqrt(1 - z ** 2)
    theta = 2 * idx * np.pi * phi
    views = np.stack([ring * np.cos(theta), ring * np.sin(theta), z], axis=1)
    views = r * views + center
    return torch.from_numpy(views.astype(np.float32))
def batch_viewpoint_params_to_matrix(batch_towards, batch_angle):
    """ Transform approach vectors and in-plane rotation angles to rotation matrices.

    Input:
        batch_towards: [torch.FloatTensor, (N,3)]
            approach vectors in batch
        batch_angle: [torch.FloatTensor, (N,)]
            in-plane rotation angles in batch
    Output:
        batch_matrix: [torch.FloatTensor, (N,3,3)]
            rotation matrices in batch
    """
    axis_x = batch_towards
    ones = torch.ones(axis_x.shape[0], dtype=axis_x.dtype, device=axis_x.device)
    zeros = torch.zeros(axis_x.shape[0], dtype=axis_x.dtype, device=axis_x.device)
    # y axis: perpendicular to x within the horizontal plane
    axis_y = torch.stack([-axis_x[:, 1], axis_x[:, 0], zeros], dim=-1)
    # degenerate case (x parallel to the z axis): fall back to y = (0, 1, 0)
    mask_y = (torch.norm(axis_y, dim=-1) == 0)
    axis_y[mask_y, 1] = 1
    axis_x = axis_x / torch.norm(axis_x, dim=-1, keepdim=True)
    axis_y = axis_y / torch.norm(axis_y, dim=-1, keepdim=True)
    # explicit dim=-1 is required: torch.cross defaults to the FIRST
    # dimension of size 3, which silently crosses along the batch axis
    # whenever N == 3
    axis_z = torch.cross(axis_x, axis_y, dim=-1)
    sin = torch.sin(batch_angle)
    cos = torch.cos(batch_angle)
    # R1: in-plane rotation about the approach (x) axis
    R1 = torch.stack([ones, zeros, zeros, zeros, cos, -sin, zeros, sin, cos], dim=-1)
    R1 = R1.reshape([-1, 3, 3])
    # R2: frame whose columns are (x, y, z)
    R2 = torch.stack([axis_x, axis_y, axis_z], dim=-1)
    batch_matrix = torch.matmul(R2, R1)
    return batch_matrix
def huber_loss(error, delta=1.0):
    """Element-wise Huber loss.

    With x = error (pred - gt or dist(pred, gt)):
        0.5 * |x|^2                    if |x| <= delta
        0.5 * delta^2 + delta*(|x|-delta)  if |x| >  delta

    Args:
        error: torch tensor of any shape (d1, d2, ..., dk).
        delta (float): transition point between the quadratic and linear regimes.

    Returns:
        torch tensor of the same shape as `error`.

    Author: Charles R. Qi
    Ref: https://github.com/charlesq34/frustum-pointnets/blob/master/models/model_util.py
    """
    abs_err = error.abs()
    quad = abs_err.clamp(max=delta)   # quadratic part, capped at delta
    lin = abs_err - quad              # remainder handled linearly
    return 0.5 * quad.pow(2) + delta * lin

View File

@@ -0,0 +1,20 @@
# import unittest
# import gc
# import operator as op
# import functools
import torch
# from torch.autograd import Variable, Function
# from knn_pytorch import knn_pytorch
# import knn_pytorch
from . import knn_pytorch
def knn(ref, query, k=1):
    """Compute the k nearest reference points for each query point.

    Args:
        ref: (B, dim, n_ref) tensor of reference points.
        query: (B, dim, n_query) tensor of query points; moved to ref's device.
        k (int): number of neighbours to return.

    Returns:
        LongTensor of shape (B, k, n_query), filled in place by the
        knn_pytorch C/CUDA extension.
    """
    device = ref.device
    ref, query = ref.float().to(device), query.float().to(device)
    batch, n_query = query.shape[0], query.shape[2]
    inds = torch.empty(batch, k, n_query).long().to(device)
    knn_pytorch.knn(ref, query, inds)
    return inds

View File

@@ -0,0 +1 @@
from .knn_pytorch import *

View File

@@ -0,0 +1,66 @@
#!/usr/bin/env python
import glob
import os
import torch
from setuptools import find_packages
from setuptools import setup
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
requirements = ["torch", "torchvision"]
def get_extensions():
    """Collect the C++/CUDA sources and configure the knn_pytorch extension.

    The CPU sources are always compiled; when torch reports an available CUDA
    device and CUDA_HOME is set, the .cu kernels are added, WITH_CUDA is
    defined and nvcc flags are supplied.

    Returns:
        list: a single-element list holding the configured extension object.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(this_dir, "src")

    # Entry-point .cpp first, then the CPU implementation files.
    sources = glob.glob(os.path.join(extensions_dir, "*.cpp"))
    sources += glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
    cuda_sources = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))

    extension_cls = CppExtension
    define_macros = []
    extra_compile_args = {"cxx": []}

    if torch.cuda.is_available() and CUDA_HOME is not None:
        extension_cls = CUDAExtension
        sources += cuda_sources
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]

    # glob already returned absolute paths, so this join is a no-op on POSIX;
    # kept for parity with the original layout.
    sources = [os.path.join(extensions_dir, s) for s in sources]

    return [
        extension_cls(
            "knn_pytorch.knn_pytorch",
            sources,
            include_dirs=[extensions_dir],
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    ]
# Build/installation entry point: compiles the collected C++/CUDA sources with
# torch's BuildExtension so the binary module `knn_pytorch.knn_pytorch` can be
# imported by the Python wrapper.
setup(
    name="knn_pytorch",
    version="0.1",
    author="foolyc",
    url="https://github.com/foolyc/torchKNN",
    description="KNN implement in Pytorch 1.0 including both cpu version and gpu version",
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)

View File

@@ -0,0 +1,56 @@
#include "cpu/vision.h"
// Brute-force CPU k-NN for a single batch element.
//   ref_dev   : (height x ref_width) reference points, one dimension per row
//   query_dev : (height x query_width) query points, one dimension per row
//   dist_dev  : scratch, (query_width x ref_width) squared distances
//   ind_dev   : output, (k x query_width); NOTE indices are written 1-based
//               (ind_buf is seeded with i+1)
//   ind_buf   : scratch of ref_width entries, reused for every query
void knn_cpu(float* ref_dev, int ref_width, float* query_dev, int query_width,
             int height, int k, float* dist_dev, long* ind_dev, long* ind_buf)
{
    // Compute all the squared distances (no sqrt; ordering is unchanged)
    for(int query_idx = 0;query_idx<query_width;query_idx++)
    {
        for(int ref_idx = 0;ref_idx < ref_width;ref_idx++)
        {
            dist_dev[query_idx * ref_width + ref_idx] = 0;
            for(int hi=0;hi<height;hi++)
                dist_dev[query_idx * ref_width + ref_idx] += (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]) * (ref_dev[hi * ref_width + ref_idx] - query_dev[hi * query_width + query_idx]);
        }
    }

    float temp_value;
    long temp_idx;
    // Bubble-sort each query's distance row (O(ref_width^2)), carrying the
    // 1-based indices along in ind_buf; then emit the first k indices
    // column-major: ind_dev[query_idx + i * query_width] = (i+1)-th neighbour.
    for(int query_idx = 0;query_idx<query_width;query_idx++)
    {
        for(int i = 0;i < ref_width;i++)
        {
            ind_buf[i] = i+1;   // 1-based index convention
        }
        for(int i = 0;i < ref_width;i++)
            for(int j = 0;j < ref_width - i - 1;j++)
            {
                if(dist_dev[query_idx * ref_width + j] > dist_dev[query_idx * ref_width + j + 1])
                {
                    // swap distance and its index together
                    temp_value = dist_dev[query_idx * ref_width + j];
                    dist_dev[query_idx * ref_width + j] = dist_dev[query_idx * ref_width + j + 1];
                    dist_dev[query_idx * ref_width + j + 1] = temp_value;
                    temp_idx = ind_buf[j];
                    ind_buf[j] = ind_buf[j + 1];
                    ind_buf[j + 1] = temp_idx;
                }
            }
        for(int i = 0;i < k;i++)
            ind_dev[query_idx + i * query_width] = ind_buf[i];
#if DEBUG
        for(int i = 0;i < ref_width;i++)
            printf("%d, ", ind_buf[i]);
        printf("\n");
#endif
    }
}

View File

@@ -0,0 +1,6 @@
#pragma once
#include <torch/extension.h>
// CPU brute-force k-NN: fills ind_dev with the 1-based indices of the k
// nearest reference points for every query point. dist_dev and ind_buf are
// caller-provided scratch buffers.
void knn_cpu(float* ref_dev, int ref_width,
             float* query_dev, int query_width,
             int height, int k, float* dist_dev, long* ind_dev, long* ind_buf);

View File

@@ -0,0 +1,269 @@
/** Modifed version of knn-CUDA from https://github.com/vincentfpgarcia/kNN-CUDA
* The modifications are
* removed texture memory usage
* removed split query KNN computation
* added feature extraction with bilinear interpolation
*
* Last modified by Christopher B. Choy <chrischoy@ai.stanford.edu> 12/23/2016
*/
// Includes
#include <cstdio>
#include "cuda.h"
#define IDX2D(i, j, dj) (dj * i + j)
#define IDX3D(i, j, k, dj, dk) (IDX2D(IDX2D(i, j, dj), k, dk))
#define BLOCK 512
#define MAX_STREAMS 512
// Constants used by the program
#define BLOCK_DIM 16
#define DEBUG 0
/**
* Computes the distance between two matrix A (reference points) and
* B (query points) containing respectively wA and wB points.
*
* @param A pointer on the matrix A
* @param wA width of the matrix A = number of points in A
* @param B pointer on the matrix B
* @param wB width of the matrix B = number of points in B
* @param dim dimension of points = height of matrices A and B
* @param AB pointer on the matrix containing the wA*wB distances computed
*/
// Tiled computation of all pairwise *squared* Euclidean distances between the
// wA reference points (columns of A) and the wB query points (columns of B).
// Output AB is wA rows x wB columns; no square root is taken here.
__global__ void cuComputeDistanceGlobal( float* A, int wA,
    float* B, int wB, int dim,  float* AB){

  // Declaration of the shared memory arrays As and Bs used to store the sub-matrix of A and B
  __shared__ float shared_A[BLOCK_DIM][BLOCK_DIM];
  __shared__ float shared_B[BLOCK_DIM][BLOCK_DIM];

  // Sub-matrix of A (begin, step, end) and Sub-matrix of B (begin, step)
  __shared__ int begin_A;
  __shared__ int begin_B;
  __shared__ int step_A;
  __shared__ int step_B;
  __shared__ int end_A;

  // Thread index
  int tx = threadIdx.x;
  int ty = threadIdx.y;

  // Other variables
  float tmp;
  float ssd = 0;  // running sum of squared differences for this (ref, query) pair

  // Loop parameters
  begin_A = BLOCK_DIM * blockIdx.y;
  begin_B = BLOCK_DIM * blockIdx.x;
  step_A = BLOCK_DIM * wA;
  step_B = BLOCK_DIM * wB;
  end_A = begin_A + (dim-1) * wA;

  // Conditions
  int cond0 = (begin_A + tx < wA); // used to write in shared memory
  int cond1 = (begin_B + tx < wB); // used to write in shared memory & to computations and to write in output matrix
  int cond2 = (begin_A + ty < wA); // used to computations and to write in output matrix

  // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
  for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {
    // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
    if (a/wA + ty < dim){
      shared_A[ty][tx] = (cond0)? A[a + wA * ty + tx] : 0;
      shared_B[ty][tx] = (cond1)? B[b + wB * ty + tx] : 0;
    }
    else{
      shared_A[ty][tx] = 0;
      shared_B[ty][tx] = 0;
    }

    // Synchronize to make sure the matrices are loaded
    __syncthreads();

    // Compute the difference between the two matrixes; each thread computes one element of the block sub-matrix
    if (cond2 && cond1){
      for (int k = 0; k < BLOCK_DIM; ++k){
        tmp = shared_A[k][ty] - shared_B[k][tx];
        ssd += tmp*tmp;
      }
    }

    // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
    __syncthreads();
  }

  // Write the block sub-matrix to device memory; each thread writes one element
  if (cond2 && cond1)
    AB[(begin_A + ty) * wB + begin_B + tx] = ssd;
}
/**
* Gathers k-th smallest distances for each column of the distance matrix in the top.
*
* @param dist distance matrix
* @param ind index matrix
* @param width width of the distance matrix and of the index matrix
* @param height height of the distance matrix and of the index matrix
* @param k number of neighbors to consider
*/
// Per-column partial insertion sort of the distance matrix. One thread per
// query column: afterwards the first k rows of each column hold that query's
// k smallest distances in ascending order, with the matching reference
// indices in `ind`. NOTE: indices are written 1-based (p_ind[0] = 1, l+1).
__global__ void cuInsertionSort(float *dist, long *ind, int width, int height, int k){

  // Variables
  int l, i, j;
  float *p_dist;
  long *p_ind;
  float curr_dist, max_dist;
  long curr_row, max_row;
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;  // query column id

  if (xIndex<width){
    // Pointer shift, initialization, and max value
    p_dist = dist + xIndex;
    p_ind = ind + xIndex;
    max_dist = p_dist[0];
    p_ind[0] = 1;

    // Part 1 : sort the first k elements
    for (l=1; l<k; l++){
      curr_row = l * width;
      curr_dist = p_dist[curr_row];
      if (curr_dist<max_dist){
        // find insertion position within the sorted prefix
        i=l-1;
        for (int a=0; a<l-1; a++){
          if (p_dist[a*width]>curr_dist){
            i=a;
            break;
          }
        }
        // shift the tail down one row and insert
        for (j=l; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width] = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width] = l+1;  // 1-based index
      } else {
        p_ind[l*width] = l+1;
      }
      max_dist = p_dist[curr_row];
    }

    // Part 2 : insert each remaining element into the k-th first lines
    max_row = (k-1)*width;
    for (l=k; l<height; l++){
      curr_dist = p_dist[l*width];
      if (curr_dist<max_dist){
        i=k-1;
        for (int a=0; a<k-1; a++){
          if (p_dist[a*width]>curr_dist){
            i=a;
            break;
          }
        }
        for (j=k-1; j>i; j--){
          p_dist[j*width] = p_dist[(j-1)*width];
          p_ind[j*width] = p_ind[(j-1)*width];
        }
        p_dist[i*width] = curr_dist;
        p_ind[i*width] = l+1;  // 1-based index
        max_dist = p_dist[max_row];
      }
    }
  }
}
/**
* Computes the square root of the first line (width-th first element)
* of the distance matrix.
*
* @param dist distance matrix
* @param width width of the distance matrix
* @param k number of neighbors to consider
*/
// Element-wise sqrt over the first k rows of the distance matrix, converting
// squared distances to Euclidean ones. (Its launch in knn_device is commented
// out, so the pipeline currently leaves distances squared; ordering — and
// therefore the indices — is unaffected.)
__global__ void cuParallelSqrt(float *dist, int width, int k){
  unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
  if (xIndex<width && yIndex<k)
    dist[yIndex*width + xIndex] = sqrt(dist[yIndex*width + xIndex]);
}
//-----------------------------------------------------------------------------------------------//
// K-th NEAREST NEIGHBORS //
//-----------------------------------------------------------------------------------------------//
/**
* K nearest neighbor algorithm
* - Initialize CUDA
* - Allocate device memory
* - Copy point sets (reference and query points) from host to device memory
* - Compute the distances + indexes to the k nearest neighbors for each query point
* - Copy distances from device to host memory
*
* @param ref_host reference points ; pointer to linear matrix
* @param ref_nb number of reference points ; width of the matrix
* @param query_host query points ; pointer to linear matrix
* @param query_nb number of query points ; width of the matrix
* @param dim dimension of points ; height of the matrices
* @param k number of neighbor to consider
* @param dist_host distances to k nearest neighbors ; pointer to linear matrix
* @param dist_host indexes of the k nearest neighbors ; pointer to linear matrix
*
*/
// Runs the k-NN pipeline for one batch element on `stream`:
//   dist_dev : (ref_nb x query_nb) scratch matrix of squared distances
//   ind_dev  : (k x query_nb) output of 1-based reference indices
void knn_device(float* ref_dev, int ref_nb, float* query_dev, int query_nb,
    int dim, int k, float* dist_dev, long* ind_dev, cudaStream_t stream){

  // Grids and threads: 16x16 tiles for the distance kernel, 256-wide 1-D
  // blocks for the per-column sort; grids rounded up for ragged sizes.
  dim3 g_16x16(query_nb/16, ref_nb/16, 1);
  dim3 t_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_16x16.x += 1;
  if (ref_nb %16 != 0) g_16x16.y += 1;
  //
  dim3 g_256x1(query_nb/256, 1, 1);
  dim3 t_256x1(256, 1, 1);
  if (query_nb%256 != 0) g_256x1.x += 1;

  dim3 g_k_16x16(query_nb/16, k/16, 1);
  dim3 t_k_16x16(16, 16, 1);
  if (query_nb%16 != 0) g_k_16x16.x += 1;
  if (k  %16 != 0) g_k_16x16.y += 1;

  // Kernel 1: Compute all the distances
  cuComputeDistanceGlobal<<<g_16x16, t_16x16, 0, stream>>>(ref_dev, ref_nb, query_dev, query_nb, dim, dist_dev);

  // Kernel 2: Sort each column
  cuInsertionSort<<<g_256x1, t_256x1, 0, stream>>>(dist_dev, ind_dev, query_nb, ref_nb, k);

  // Kernel 3 (disabled): would convert squared distances to Euclidean ones;
  // ordering is unaffected, so the reported indices are correct either way.
  // cuParallelSqrt<<<g_k_16x16,t_k_16x16, 0, stream>>>(dist_dev, query_nb, k);

#if DEBUG
  // NOTE(review): debug-only host copies; dist_host / idx_host are allocated
  // with new[] and never freed — a leak, tolerated only on this debug path.
  unsigned int size_of_float = sizeof(float);
  unsigned long size_of_long = sizeof(long);

  float* dist_host = new float[query_nb * k];
  long* idx_host = new long[query_nb * k];

  // Memory copy of output from device to host
  cudaMemcpy(&dist_host[0], dist_dev,
      query_nb * k *size_of_float, cudaMemcpyDeviceToHost);

  cudaMemcpy(&idx_host[0], ind_dev,
      query_nb * k * size_of_long, cudaMemcpyDeviceToHost);

  int i = 0;
  for(i = 0; i < 100; i++){
    printf("IDX[%d]: %d\n", i, (int)idx_host[i]);
  }
#endif
}

View File

@@ -0,0 +1,8 @@
#pragma once
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
void knn_device(float* ref_dev, int ref_width,
float* query_dev, int query_width,
int height, int k, float* dist_dev, long* ind_dev, cudaStream_t stream);

View File

@@ -0,0 +1,81 @@
#pragma once
#include "cpu/vision.h"
#ifdef WITH_CUDA
#include "cuda/vision.h"
//#include <THC/THC.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
//extern THCState *state;
#endif
// Batched k-nearest-neighbour search, dispatched on ref's device.
//   ref   : (batch, dim, ref_nb)   float tensor of reference points
//   query : (batch, dim, query_nb) float tensor of query points
//   idx   : (batch, k, query_nb)   long tensor, filled with neighbour indices
// Returns 1 on success.
int knn(at::Tensor& ref, at::Tensor& query, at::Tensor& idx)
{
    // NOTE: check dimensions
    long batch, ref_nb, query_nb, dim, k;
    batch = ref.size(0);
    dim = ref.size(1);
    k = idx.size(1);
    ref_nb = ref.size(2);
    query_nb = query.size(2);

    float *ref_dev = ref.data<float>();
    float *query_dev = query.data<float>();
    long *idx_dev = idx.data<long>();

    if (ref.type().is_cuda()) {
#ifdef WITH_CUDA
        // Scratch buffer holding the full (ref_nb x query_nb) distance matrix.
        float *dist_dev = nullptr;
        cudaError_t err = cudaMalloc((void**)&dist_dev, ref_nb * query_nb * sizeof(float));
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaMalloc failed\n");
            exit(1);
        }
        // BUGFIX: dist_dev used to be cudaFree'd right here, *before* the
        // knn_device loop used it — every kernel launch then wrote into freed
        // memory, and the free after the loop was a double free. The single
        // cudaFree now happens after the loop only.
        for (int b = 0; b < batch; b++)
        {
            knn_device(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
            dist_dev, idx_dev + b * k * query_nb, c10::cuda::getCurrentCUDAStream());
        }
        cudaFree(dist_dev);

        err = cudaGetLastError();
        if (err != cudaSuccess)
        {
            printf("error in knn: %s\n", cudaGetErrorString(err));
            fprintf(stderr, "aborting\n");
            exit(1);
        }
        return 1;
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }

    // CPU fallback: dist_dev holds one query-major distance matrix, ind_buf is
    // the per-query index scratch consumed by knn_cpu's bubble sort.
    float *dist_dev = (float*)malloc(ref_nb * query_nb * sizeof(float));
    long *ind_buf = (long*)malloc(ref_nb * sizeof(long));
    for (int b = 0; b < batch; b++) {
        knn_cpu(ref_dev + b * dim * ref_nb, ref_nb, query_dev + b * dim * query_nb, query_nb, dim, k,
        dist_dev, idx_dev + b * k * query_nb, ind_buf);
    }

    free(dist_dev);
    free(ind_buf);

    return 1;
}

View File

@@ -0,0 +1,5 @@
#include "knn.h"
// Python binding: exposes knn(ref, query, idx) -> int on the extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("knn", &knn, "k-nearest neighbors");
}

View File

@@ -0,0 +1,11 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include <torch/extension.h>
// Returns an int32 index tensor of shape (B, m, nsample): for every centroid
// in new_xyz, up to nsample indices of the points in xyz within `radius`.
at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
                      const int nsample);

View File

@@ -0,0 +1,46 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#ifndef _CUDA_UTILS_H
#define _CUDA_UTILS_H
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cmath>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
#define TOTAL_THREADS 512
// Largest power of two <= work_size, clamped to [1, TOTAL_THREADS]; used to
// choose a 1-D thread-block size.
inline int opt_n_threads(int work_size) {
  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);

  return max(min(1 << pow_2, TOTAL_THREADS), 1);
}

// 2-D block configuration: x gets its optimal width first, y receives what is
// left of the TOTAL_THREADS budget.
inline dim3 opt_block_config(int x, int y) {
  const int x_threads = opt_n_threads(x);
  const int y_threads =
      max(min(opt_n_threads(y), TOTAL_THREADS / x_threads), 1);
  dim3 block_config(x_threads, y_threads, 1);

  return block_config;
}

// Abort with a diagnostic if the most recent CUDA call / kernel launch failed.
#define CUDA_CHECK_ERRORS()                                           \
  do {                                                                \
    cudaError_t err = cudaGetLastError();                             \
    if (cudaSuccess != err) {                                         \
      fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n",  \
              cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
              __FILE__);                                              \
      exit(-1);                                                       \
    }                                                                 \
  } while (0)
#endif

View File

@@ -0,0 +1,7 @@
// Author: chenxi-wang
#pragma once
#include <torch/extension.h>
// Returns an int32 index tensor of shape (B, m, nsample): for every center in
// new_xyz (with per-center 3x3 rotation rot), up to nsample indices of points
// in xyz inside the cylinder of radius `radius` spanning (hmin, hmax) along
// the rotated axis.
at::Tensor cylinder_query(at::Tensor new_xyz, at::Tensor xyz, at::Tensor rot, const float radius, const float hmin, const float hmax,
                          const int nsample);

View File

@@ -0,0 +1,10 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include <torch/extension.h>
// Gather point features by index (forward) and scatter-accumulate the
// gradient back onto the n original points (backward).
at::Tensor group_points(at::Tensor points, at::Tensor idx);
at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);

View File

@@ -0,0 +1,15 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include <torch/extension.h>
#include <vector>
// three_nn: find the 3 closest `knows` points for each unknown point.
// three_interpolate: blend features of those neighbours with the given
// weights; three_interpolate_grad propagates gradients back to m known points.
std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows);
at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
                             at::Tensor weight);
at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,
                                  at::Tensor weight, const int m);

View File

@@ -0,0 +1,11 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include <torch/extension.h>
// gather_points / gather_points_grad: index-select point features and the
// matching backward pass; furthest_point_sampling returns nsamples indices
// chosen by iterative farthest-point sampling.
at::Tensor gather_points(at::Tensor points, at::Tensor idx);
at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx, const int n);
at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples);

View File

@@ -0,0 +1,32 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#ifdef WITH_CUDA
#include <ATen/cuda/CUDAContext.h>
#endif
#include <torch/extension.h>
// Argument-validation macros shared by the extension entry points; each one
// raises a torch error (via TORCH_CHECK) naming the offending tensor.

// x must live on a CUDA device.
#define CHECK_CUDA(x)                                          \
  do {                                                         \
    TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor");     \
  } while (0)

// x must be contiguous in memory.
#define CHECK_CONTIGUOUS(x)                                         \
  do {                                                              \
    TORCH_CHECK(x.is_contiguous(), #x " must be a contiguous tensor"); \
  } while (0)

// x must have dtype int32.
#define CHECK_IS_INT(x)                                       \
  do {                                                        \
    TORCH_CHECK(x.scalar_type() == at::ScalarType::Int,       \
                #x " must be an int tensor");                 \
  } while (0)

// x must have dtype float32.
#define CHECK_IS_FLOAT(x)                                     \
  do {                                                        \
    TORCH_CHECK(x.scalar_type() == at::ScalarType::Float,     \
                #x " must be a float tensor");                \
  } while (0)

View File

@@ -0,0 +1,113 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include<iostream>
#include "ball_query.h"
#include "utils.h"
#ifdef WITH_CUDA
void query_ball_point_kernel_wrapper(
int b, int n, int m, float radius,
int nsample,
const float *new_xyz,
const float *xyz,
int *idx
);
#endif
void query_ball_point_cpu(
int b, int n, int m, float radius,
int nsample,
const float *new_xyz,
const float *xyz,
int *idx
);
#ifdef DEBUG
// Pretty-printer for the ball_query arguments (compiled in DEBUG builds only).
void ball_query_debug(const torch::Tensor& new_xyz, const torch::Tensor& xyz,
                      float radius, int nsample) {
  std::cout << "===== ball_query.cpp 调试信息 =====" << std::endl;
  // print the sizes of new_xyz
  std::cout << "new_xyz.size(): (";
  for (auto s : new_xyz.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" <<std::endl;
  // print the sizes of xyz
  std::cout << "xyz.size(): (";
  for (auto s : xyz.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" << std::endl;
  // print the search radius and the sample count
  std::cout << "radius: " << radius << std::endl;
  std::cout << "nsample: " << nsample << std::endl;
  std::cout << "====================================" << std::endl;
}
#endif
// For every centroid in new_xyz (B, m, 3), collect up to nsample indices of
// the points in xyz (B, n, 3) lying within `radius`. Returns an int32 index
// tensor of shape (B, m, nsample); dispatches to CUDA or CPU on the device
// of new_xyz.
at::Tensor ball_query(at::Tensor new_xyz, at::Tensor xyz, const float radius,
                      const int nsample) {
  CHECK_CONTIGUOUS(new_xyz);
  CHECK_CONTIGUOUS(xyz);
  CHECK_IS_FLOAT(new_xyz);
  CHECK_IS_FLOAT(xyz);

#ifdef WITH_CUDA
  if (new_xyz.is_cuda()) {
    CHECK_CUDA(xyz);
    // NOTE: a cudaDeviceSynchronize() and duplicate contiguity TORCH_CHECKs
    // were removed here — CHECK_CONTIGUOUS above already enforces the layout,
    // and a full-device sync before every launch is pure overhead: torch
    // allocations are ordered on the current stream the kernel runs on.
  }
#endif

#ifdef DEBUG
  ball_query_debug(new_xyz, xyz, radius, nsample);
#endif

  at::Tensor idx =
      torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
                   at::device(new_xyz.device()).dtype(at::ScalarType::Int));

#ifdef WITH_CUDA
  if (new_xyz.is_cuda()) {
    // Inputs are already contiguous (checked above), so the raw pointers can
    // be handed to the kernel directly.
    query_ball_point_kernel_wrapper(
        xyz.size(0),
        xyz.size(1),
        new_xyz.size(1),
        radius, nsample,
        new_xyz.data_ptr<float>(),
        xyz.data_ptr<float>(),
        idx.data_ptr<int>()
    );
  } else {
#endif
    query_ball_point_cpu(
        xyz.size(0),
        xyz.size(1),
        new_xyz.size(1),
        radius,
        nsample,
        new_xyz.data_ptr<float>(),
        xyz.data_ptr<float>(),
        idx.data_ptr<int>()
    );
#ifdef WITH_CUDA
  }
#endif
  return idx;
}

View File

@@ -0,0 +1,44 @@
// CPU reference implementation of ball query: for every centroid, gather up
// to `nsample` indices of points within `radius`. The first hit pre-fills the
// whole output row, so slots beyond the points actually found stay valid.
void query_ball_point_cpu(
    int b, int n, int m, float radius,
    int nsample,
    const float* new_xyz,
    const float* xyz,
    int* idx
) {
    const float r2 = radius * radius;  // compare squared distances, no sqrt
    for (int bi = 0; bi < b; ++bi) {
        // Slices for this batch element.
        const float* pts = xyz + bi * n * 3;
        const float* centers = new_xyz + bi * m * 3;
        int* out = idx + bi * m * nsample;
        for (int ci = 0; ci < m; ++ci) {
            const float cx = centers[ci * 3 + 0];
            const float cy = centers[ci * 3 + 1];
            const float cz = centers[ci * 3 + 2];
            int found = 0;
            for (int pi = 0; pi < n && found < nsample; ++pi) {
                const float dx = cx - pts[pi * 3 + 0];
                const float dy = cy - pts[pi * 3 + 1];
                const float dz = cz - pts[pi * 3 + 2];
                if (dx * dx + dy * dy + dz * dz >= r2) continue;
                if (found == 0) {
                    // First neighbour: fill every slot with it as the default.
                    for (int s = 0; s < nsample; ++s) {
                        out[ci * nsample + s] = pi;
                    }
                }
                out[ci * nsample + found] = pi;
                ++found;
            }
        }
    }
}

View File

@@ -0,0 +1,59 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
// input: new_xyz(b, m, 3) xyz(b, n, 3)
// output: idx(b, m, nsample)
// One block per batch element; threads stride over the m centroids. For each
// centroid the first nsample points within `radius` are recorded; the first
// hit pre-fills the whole row so unused slots stay valid.
__global__ void query_ball_point_kernel(int b, int n, int m, float radius,
                                        int nsample,
                                        const float *__restrict__ new_xyz,
                                        const float *__restrict__ xyz,
                                        int *__restrict__ idx) {
  int batch_index = blockIdx.x;
  // Shift the pointers to this batch element's slice.
  xyz += batch_index * n * 3;
  new_xyz += batch_index * m * 3;
  idx += m * nsample * batch_index;

  int index = threadIdx.x;
  int stride = blockDim.x;

  float radius2 = radius * radius;  // compare squared distances, avoids sqrt
  for (int j = index; j < m; j += stride) {
    float new_x = new_xyz[j * 3 + 0];
    float new_y = new_xyz[j * 3 + 1];
    float new_z = new_xyz[j * 3 + 2];
    for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) {
      float x = xyz[k * 3 + 0];
      float y = xyz[k * 3 + 1];
      float z = xyz[k * 3 + 2];
      float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
                 (new_z - z) * (new_z - z);
      if (d2 < radius2) {
        if (cnt == 0) {
          // First neighbour found: fill every slot with it as the default.
          for (int l = 0; l < nsample; ++l) {
            idx[j * nsample + l] = k;
          }
        }
        idx[j * nsample + cnt] = k;
        ++cnt;
      }
    }
  }
}

// Host-side launcher: one block per batch element on torch's current stream.
void query_ball_point_kernel_wrapper(int b, int n, int m, float radius,
                                     int nsample, const float *new_xyz,
                                     const float *xyz, int *idx) {
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  query_ball_point_kernel<<<b, opt_n_threads(m), 0, stream>>>(
      b, n, m, radius, nsample, new_xyz, xyz, idx);

  CUDA_CHECK_ERRORS();
}

View File

@@ -0,0 +1,27 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include "ball_query.h"
#include "group_points.h"
#include "interpolate.h"
#include "sampling.h"
#include "cylinder_query.h"
// Python bindings for the pointnet2 CPU/CUDA operators.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("gather_points", &gather_points);
  m.def("gather_points_grad", &gather_points_grad);
  m.def("furthest_point_sampling", &furthest_point_sampling);

  m.def("three_nn", &three_nn);
  m.def("three_interpolate", &three_interpolate);
  m.def("three_interpolate_grad", &three_interpolate_grad);

  m.def("ball_query", &ball_query);

  m.def("group_points", &group_points);
  m.def("group_points_grad", &group_points_grad);

  m.def("cylinder_query", &cylinder_query);
}

View File

@@ -0,0 +1,95 @@
// Author: chenxi-wang
#include "cylinder_query.h"
#include "utils.h"
#ifdef WITH_CUDA
void query_cylinder_point_kernel_wrapper(int b, int n, int m, float radius, float hmin, float hmax,
int nsample, const float *new_xyz,
const float *xyz, const float *rot, int *idx);
#endif
void query_cylinder_point_cpu(
int b, int n, int m, float radius, float hmin, float hmax,
int nsample, const float *new_xyz,
const float *xyz, const float *rot, int *idx);
#ifdef DEBUG
// Pretty-printer for the cylinder_query arguments (DEBUG builds only).
void cylinder_query_debug(at::Tensor new_xyz, at::Tensor xyz, at::Tensor rot,
                          const float radius, const float hmin, const float hmax,
                          const int nsample) {
  std::cout << "===== cylinder_query.cpp 调试信息 =====" << std::endl;
  // print the sizes of new_xyz
  std::cout << "new_xyz.size(): (";
  for (auto s : new_xyz.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" <<std::endl;
  // print the sizes of xyz
  std::cout << "xyz.size(): (";
  for (auto s : xyz.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" << std::endl;
  // print the sizes of rot
  std::cout << "rot.size(): (";
  for (auto s : rot.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" << std::endl;
  // print the cylinder parameters and sample count
  std::cout << "radius: " << radius << std::endl;
  std::cout << "hmin: " << hmin << std::endl;
  std::cout << "hmax: " << hmax << std::endl;
  std::cout << "nsample: " << nsample << std::endl;
  std::cout << "====================================" << std::endl;
}
#endif
// For each center in new_xyz (B, m, 3) with per-center rotation rot (B, m, 9),
// collect up to nsample indices of points in xyz (B, n, 3) falling inside the
// cylinder of radius `radius` spanning (hmin, hmax) along the rotated axis.
// Returns int32 idx of shape (B, m, nsample); dispatches to CUDA or CPU.
at::Tensor cylinder_query(at::Tensor new_xyz, at::Tensor xyz, at::Tensor rot, const float radius, const float hmin, const float hmax,
                          const int nsample) {
  CHECK_CONTIGUOUS(new_xyz);
  CHECK_CONTIGUOUS(xyz);
  CHECK_CONTIGUOUS(rot);
  CHECK_IS_FLOAT(new_xyz);
  CHECK_IS_FLOAT(xyz);
  CHECK_IS_FLOAT(rot);

#ifdef WITH_CUDA
  if (new_xyz.is_cuda()) {
    CHECK_CUDA(xyz);
    CHECK_CUDA(rot);
  }
#endif

#ifdef DEBUG
  cylinder_query_debug(new_xyz, xyz, rot, radius, hmin, hmax, nsample);
#endif

  at::Tensor idx =
      torch::zeros({new_xyz.size(0), new_xyz.size(1), nsample},
                   at::device(new_xyz.device()).dtype(at::ScalarType::Int));

#ifdef WITH_CUDA
  if (new_xyz.is_cuda()) {
    query_cylinder_point_kernel_wrapper(xyz.size(0), xyz.size(1), new_xyz.size(1),
                                        radius, hmin, hmax, nsample, new_xyz.data_ptr<float>(),
                                        xyz.data_ptr<float>(), rot.data_ptr<float>(), idx.data_ptr<int>());
  } else {
#endif
    query_cylinder_point_cpu(xyz.size(0), xyz.size(1), new_xyz.size(1),
                             radius, hmin, hmax, nsample, new_xyz.data_ptr<float>(),
                             xyz.data_ptr<float>(), rot.data_ptr<float>(), idx.data_ptr<int>());
#ifdef WITH_CUDA
  }
#endif
  return idx;
}

View File

@@ -0,0 +1,56 @@
// CPU reference implementation of cylinder query. For every centroid j the
// offset (x, y, z) of each point is expressed in the centroid's local frame:
// with rot row-major (r0..r8), x_rot/y_rot/z_rot are dot products with the
// matrix *columns*. A point is inside the cylinder when its radial distance
// in the (y_rot, z_rot) plane is below `radius` and its coordinate x_rot lies
// in (hmin, hmax). Up to nsample hits are recorded per centroid; the first
// hit pre-fills the output row so unused slots stay valid.
void query_cylinder_point_cpu(
    int b, int n, int m,
    float radius,
    float hmin, float hmax,
    int nsample,
    const float *new_xyz,
    const float *xyz,
    const float *rot,
    int *idx
) {
    float radius2 = radius * radius;  // compare squared radial distances
    for (int batch_index = 0; batch_index < b; ++batch_index) {
        // Slices for this batch element.
        const float *xyz_b = xyz + batch_index * n * 3;
        const float *new_xyz_b = new_xyz + batch_index * m * 3;
        const float *rot_b = rot + batch_index * m * 9;
        int *idx_b = idx + batch_index * m * nsample;
        for (int j = 0; j < m; ++j) {
            float nx = new_xyz_b[j * 3 + 0];
            float ny = new_xyz_b[j * 3 + 1];
            float nz = new_xyz_b[j * 3 + 2];
            // Per-centroid 3x3 rotation, row-major.
            float r0 = rot_b[j * 9 + 0];
            float r1 = rot_b[j * 9 + 1];
            float r2 = rot_b[j * 9 + 2];
            float r3 = rot_b[j * 9 + 3];
            float r4 = rot_b[j * 9 + 4];
            float r5 = rot_b[j * 9 + 5];
            float r6 = rot_b[j * 9 + 6];
            float r7 = rot_b[j * 9 + 7];
            float r8 = rot_b[j * 9 + 8];
            for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) {
                // Offset from the centroid, then rotate into the local frame.
                float x = xyz_b[k * 3 + 0] - nx;
                float y = xyz_b[k * 3 + 1] - ny;
                float z = xyz_b[k * 3 + 2] - nz;
                float x_rot = r0 * x + r3 * y + r6 * z;
                float y_rot = r1 * x + r4 * y + r7 * z;
                float z_rot = r2 * x + r5 * y + r8 * z;
                float d2 = y_rot * y_rot + z_rot * z_rot;
                if (d2 < radius2 && x_rot > hmin && x_rot < hmax) {
                    if (cnt == 0) {
                        // First hit pre-fills the row with a valid default.
                        for (int l = 0; l < nsample; ++l) {
                            idx_b[j * nsample + l] = k;
                        }
                    }
                    idx_b[j * nsample + cnt] = k;
                    ++cnt;
                }
            }
        }
    }
}

View File

@@ -0,0 +1,67 @@
// Author: chenxi-wang
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
// CUDA version of cylinder query: one block per batch element, threads stride
// over the m centroids. Point offsets are rotated into the centroid's local
// frame (dot products with the columns of the row-major 3x3 rot); a point is
// kept when its radial (y_rot, z_rot) distance is below `radius` and x_rot
// lies in (hmin, hmax). The first hit pre-fills the output row.
__global__ void query_cylinder_point_kernel(int b, int n, int m, float radius, float hmin, float hmax,
                                            int nsample,
                                            const float *__restrict__ new_xyz,
                                            const float *__restrict__ xyz,
                                            const float *__restrict__ rot,
                                            int *__restrict__ idx) {
  int batch_index = blockIdx.x;
  // Shift the pointers to this batch element's slice.
  xyz += batch_index * n * 3;
  new_xyz += batch_index * m * 3;
  rot += batch_index * m * 9;
  idx += m * nsample * batch_index;

  int index = threadIdx.x;
  int stride = blockDim.x;

  float radius2 = radius * radius;  // compare squared radial distances
  for (int j = index; j < m; j += stride) {
    float new_x = new_xyz[j * 3 + 0];
    float new_y = new_xyz[j * 3 + 1];
    float new_z = new_xyz[j * 3 + 2];
    // Per-centroid 3x3 rotation, row-major.
    float r0 = rot[j * 9 + 0];
    float r1 = rot[j * 9 + 1];
    float r2 = rot[j * 9 + 2];
    float r3 = rot[j * 9 + 3];
    float r4 = rot[j * 9 + 4];
    float r5 = rot[j * 9 + 5];
    float r6 = rot[j * 9 + 6];
    float r7 = rot[j * 9 + 7];
    float r8 = rot[j * 9 + 8];
    for (int k = 0, cnt = 0; k < n && cnt < nsample; ++k) {
      float x = xyz[k * 3 + 0] - new_x;
      float y = xyz[k * 3 + 1] - new_y;
      float z = xyz[k * 3 + 2] - new_z;
      float x_rot = r0 * x + r3 * y + r6 * z;
      float y_rot = r1 * x + r4 * y + r7 * z;
      float z_rot = r2 * x + r5 * y + r8 * z;
      float d2 = y_rot * y_rot + z_rot * z_rot;
      if (d2 < radius2 && x_rot > hmin && x_rot < hmax) {
        if (cnt == 0) {
          // First hit pre-fills the row with a valid default.
          for (int l = 0; l < nsample; ++l) {
            idx[j * nsample + l] = k;
          }
        }
        idx[j * nsample + cnt] = k;
        ++cnt;
      }
    }
  }
}

// Host-side launcher: one block per batch element on torch's current stream.
void query_cylinder_point_kernel_wrapper(int b, int n, int m, float radius, float hmin, float hmax,
                                         int nsample, const float *new_xyz,
                                         const float *xyz, const float *rot, int *idx) {
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  query_cylinder_point_kernel<<<b, opt_n_threads(m), 0, stream>>>(
      b, n, m, radius, hmin, hmax, nsample, new_xyz, xyz, rot, idx);

  CUDA_CHECK_ERRORS();
}

View File

@@ -0,0 +1,153 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include "group_points.h"
#include "utils.h"
#ifdef WITH_CUDA
void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx,
float *out);
void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
int nsample, const float *grad_out,
const int *idx, float *grad_points);
#endif
void group_points_cpu(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx,
float *out);
void group_points_grad_cpu(int b, int c, int n, int npoints,
int nsample, const float *grad_out,
const int *idx, float *grad_points);
#ifdef DEBUG
// Pretty-printer for the group_points arguments (DEBUG builds only).
void group_points_debug(at::Tensor points, at::Tensor idx) {
  std::cout << "===== group_points.cpp 调试信息 =====" << std::endl;
  std::cout << "===== group points 函数调试信息 =====" << std::endl;
  // print the sizes of points
  std::cout << "points.size(): (";
  for (auto s : points.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" <<std::endl;
  // print the sizes of idx
  std::cout << "idx.size(): (";
  for (auto s : idx.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" << std::endl;
  std::cout << "====================================" << std::endl;
}
// Pretty-printer for the group_points_grad arguments (DEBUG builds only).
void group_points_grad_debug(at::Tensor grad_out, at::Tensor idx, const int n) {
  std::cout << "===== group_points.cpp 调试信息 =====" << std::endl;
  std::cout << "===== group_points_grad 函数调试信息 =====" << std::endl;
  // print the sizes of grad_out
  std::cout << "grad_out.size(): (";
  for (auto s : grad_out.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" <<std::endl;
  // print the sizes of idx
  std::cout << "idx.size(): (";
  for (auto s : idx.sizes()) {
    std::cout << s << " ";
  }
  std::cout << ")" << std::endl;
  // print the number of original points the gradient scatters back onto
  std::cout << "n: " << n << std::endl;
  std::cout << "====================================" << std::endl;
}
#endif
// Gather features by group indices: points (B, C, n) and idx
// (B, npoints, nsample) -> output (B, C, npoints, nsample), float32.
// Dispatches to CUDA or CPU on the device of `points`.
at::Tensor group_points(at::Tensor points, at::Tensor idx) {
  CHECK_CONTIGUOUS(points);
  CHECK_CONTIGUOUS(idx);
  CHECK_IS_FLOAT(points);
  CHECK_IS_INT(idx);

#ifdef WITH_CUDA
  if (points.is_cuda()) {
    CHECK_CUDA(idx);
  }
#endif

#ifdef DEBUG
  group_points_debug(points, idx);
#endif

  at::Tensor output =
      torch::zeros({points.size(0), points.size(1), idx.size(1), idx.size(2)},
                   at::device(points.device()).dtype(at::ScalarType::Float));

#ifdef WITH_CUDA
  if (points.is_cuda()) {
    group_points_kernel_wrapper(points.size(0), points.size(1), points.size(2),
                                idx.size(1), idx.size(2), points.data_ptr<float>(),
                                idx.data_ptr<int>(), output.data_ptr<float>());
  } else {
#endif
    // CPU fallback.
    group_points_cpu(points.size(0), points.size(1), points.size(2),
                     idx.size(1), idx.size(2), points.data_ptr<float>(),
                     idx.data_ptr<int>(), output.data_ptr<float>());
#ifdef WITH_CUDA
  }
#endif
  return output;
}
// Backward of group_points: scatters grad_out(b, c, npoints, nsample) back
// through idx(b, npoints, nsample) into grad_points(b, c, n); `n` is the
// point count of the original (pre-grouping) feature tensor.
at::Tensor group_points_grad(at::Tensor grad_out, at::Tensor idx, const int n) {
CHECK_CONTIGUOUS(grad_out);
CHECK_CONTIGUOUS(idx);
CHECK_IS_FLOAT(grad_out);
CHECK_IS_INT(idx);
#ifdef WITH_CUDA
if (grad_out.is_cuda()) {
CHECK_CUDA(idx);
}
#endif
#ifdef DEBUG
group_points_grad_debug(grad_out, idx, n);
#endif
// Gradients accumulate into a zero-initialized buffer.
at::Tensor output =
torch::zeros({grad_out.size(0), grad_out.size(1), n},
at::device(grad_out.device()).dtype(at::ScalarType::Float));
#ifdef WITH_CUDA
if (grad_out.is_cuda()) {
group_points_grad_kernel_wrapper(
grad_out.size(0), grad_out.size(1), n, idx.size(1), idx.size(2),
grad_out.data_ptr<float>(), idx.data_ptr<int>(), output.data_ptr<float>());
} else {
#endif
// TORCH_CHECK(false, "CPU not supported");
// CPU fallback (same semantics as the CUDA kernel).
group_points_grad_cpu(
grad_out.size(0), grad_out.size(1), n,idx.size(1), idx.size(2),
grad_out.data_ptr<float>(), idx.data_ptr<int>(), output.data_ptr<float>());
#ifdef WITH_CUDA
}
#endif
return output;
}

View File

@@ -0,0 +1,54 @@
// CPU reference implementation of the group_points forward pass.
//
// points  : (b, c, n) flattened feature tensor
// idx     : (b, npoints, nsample) gather indices into the n axis
// out     : (b, c, npoints, nsample) gathered output
//
// Mirrors the CUDA kernel, including its bounds check: an out-of-range
// index yields 0 instead of an out-of-bounds read.
void group_points_cpu(
    int b, int c, int n,
    int npoints,
    int nsample,
    const float *points,
    const int *idx,
    float *out
) {
  for (int batch = 0; batch < b; ++batch) {
    // Per-batch base pointers.
    const float *points_b = points + batch * n * c;
    const int *idx_b = idx + batch * npoints * nsample;
    float *out_b = out + batch * npoints * nsample * c;
    // Flattened loop over (channel l, point j), matching the CUDA kernel.
    for (int i = 0; i < c * npoints; ++i) {
      const int l = i / npoints;
      const int j = i % npoints;
      for (int k = 0; k < nsample; ++k) {
        int ii = idx_b[j * nsample + k];
        // Consistent with the CUDA kernel: invalid index -> 0, not UB.
        if (ii < 0 || ii >= n) {
          out_b[(l * npoints + j) * nsample + k] = 0.0f;
        } else {
          out_b[(l * npoints + j) * nsample + k] = points_b[l * n + ii];
        }
      }
    }
  }
}
// CPU reference implementation of the group_points backward pass.
//
// grad_out    : (b, c, npoints, nsample) incoming gradient
// idx         : (b, npoints, nsample) indices used in the forward pass
// grad_points : (b, c, n) output gradient, zeroed here before accumulation
//
// Out-of-range indices are skipped (they produced 0 in the forward pass,
// so they carry no gradient) instead of writing out of bounds.
void group_points_grad_cpu(
    int b, int c, int n,
    int npoints,
    int nsample,
    const float *grad_out,
    const int *idx,
    float *grad_points
) {
  for (int batch_index = 0; batch_index < b; ++batch_index) {
    // Per-batch base pointers.
    const float *grad_out_b = grad_out + batch_index * npoints * nsample * c;
    const int *idx_b = idx + batch_index * npoints * nsample;
    float *grad_points_b = grad_points + batch_index * n * c;
    // Zero the accumulator so the function is self-contained.
    for (int i = 0; i < n * c; ++i) {
      grad_points_b[i] = 0.0f;
    }
    for (int i = 0; i < c * npoints; ++i) {
      const int l = i / npoints;
      const int j = i % npoints;
      for (int k = 0; k < nsample; ++k) {
        int ii = idx_b[j * nsample + k];
        // Skip invalid indices — they contributed nothing in the forward.
        if (ii < 0 || ii >= n) continue;
        grad_points_b[l * n + ii] += grad_out_b[(l * npoints + j) * nsample + k];
      }
    }
  }
}

View File

@@ -0,0 +1,84 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
// input: points(b, c, n) idx(b, npoints, nsample)
// output: out(b, c, npoints, nsample)
// One block per batch element; the 2-D thread block is flattened and strided
// over the (c * npoints) work items. Each item copies nsample features.
__global__ void group_points_kernel(int b, int c, int n, int npoints,
                                    int nsample,
                                    const float *__restrict__ points,
                                    const int *__restrict__ idx,
                                    float *__restrict__ out) {
int batch_index = blockIdx.x;
// Shift base pointers to this batch's slice.
points += batch_index * n * c;
idx += batch_index * npoints * nsample;
out += batch_index * npoints * nsample * c;
// Flatten the 2-D thread index; stride is the total thread count per block.
const int index = threadIdx.y * blockDim.x + threadIdx.x;
const int stride = blockDim.y * blockDim.x;
for (int i = index; i < c * npoints; i += stride) {
const int l = i / npoints;  // channel
const int j = i % npoints;  // grouped point
for (int k = 0; k < nsample; ++k) {
int ii = idx[j * nsample + k];
// Out-of-range indices yield 0 instead of an out-of-bounds read.
if (ii < 0 || ii >= n) {
out[(l * npoints + j) * nsample + k] = 0;
} else {
out[(l * npoints + j) * nsample + k] = points[l * n + ii];
}
}
}
}
// Host-side launcher: one block per batch, block shape chosen by
// opt_block_config from the (npoints, c) work size; runs on the current
// PyTorch CUDA stream.
void group_points_kernel_wrapper(int b, int c, int n, int npoints, int nsample,
                                 const float *points, const int *idx,
                                 float *out) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
group_points_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
b, c, n, npoints, nsample, points, idx, out);
CUDA_CHECK_ERRORS();
}
// input: grad_out(b, c, npoints, nsample), idx(b, npoints, nsample)
// output: grad_points(b, c, n)
// Backward of group_points: one block per batch, threads strided over the
// (c * npoints) work items, scattering gradients with atomicAdd (multiple
// samples may reference the same source point).
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
                                         int nsample,
                                         const float *__restrict__ grad_out,
                                         const int *__restrict__ idx,
                                         float *__restrict__ grad_points) {
  int batch_index = blockIdx.x;
  // Shift base pointers to this batch's slice.
  grad_out += batch_index * npoints * nsample * c;
  idx += batch_index * npoints * nsample;
  grad_points += batch_index * n * c;
  const int index = threadIdx.y * blockDim.x + threadIdx.x;
  const int stride = blockDim.y * blockDim.x;
  for (int i = index; i < c * npoints; i += stride) {
    const int l = i / npoints;  // channel
    const int j = i % npoints;  // grouped point
    for (int k = 0; k < nsample; ++k) {
      int ii = idx[j * nsample + k];
      // Mirror the forward kernel's bounds check: an out-of-range index
      // produced 0 in the forward pass, so it must not scatter a gradient
      // (and would otherwise corrupt memory outside grad_points).
      if (ii < 0 || ii >= n) continue;
      atomicAdd(grad_points + l * n + ii,
                grad_out[(l * npoints + j) * nsample + k]);
    }
  }
}
// Host-side launcher for the grouping backward kernel; same launch
// configuration as the forward wrapper, on the current PyTorch CUDA stream.
void group_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
                                      int nsample, const float *grad_out,
                                      const int *idx, float *grad_points) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
group_points_grad_kernel<<<b, opt_block_config(npoints, c), 0, stream>>>(
b, c, n, npoints, nsample, grad_out, idx, grad_points);
CUDA_CHECK_ERRORS();
}

View File

@@ -0,0 +1,241 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include "interpolate.h"
#include "utils.h"
#ifdef WITH_CUDA
void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx);
void three_interpolate_kernel_wrapper(int b, int c, int m, int n,
const float *points, const int *idx,
const float *weight, float *out);
void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m,
const float *grad_out,
const int *idx, const float *weight,
float *grad_points);
#endif
void three_nn_cpu(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx);
void three_interpolate_cpu(int b, int c, int m, int n,
const float *points, const int *idx,
const float *weight, float *out);
void three_interpolate_grad_cpu(int b, int c, int n, int m,
const float *grad_out,
const int *idx, const float *weight,
float *grad_points);
#ifdef DEBUG
// Debug helper: dump the shapes of the inputs to three_nn on stdout.
void three_nn_debug(at::Tensor unknowns, at::Tensor knows) {
// Shared shape printer; output is byte-identical to the hand-rolled loops.
const auto print_shape = [](const char *label, const at::Tensor &t) {
std::cout << label << "(";
for (int64_t d = 0; d < t.dim(); ++d) {
std::cout << t.size(d) << " ";
}
std::cout << ")" << std::endl;
};
std::cout << "===== interpolate.cpp 调试信息 =====" << std::endl;
std::cout << "=========== three_nn ===========" << std::endl;
print_shape("unknowns.size(): ", unknowns);
print_shape("knows.size(): ", knows);
std::cout << "====================================" << std::endl;
}
// Debug helper: dump the shapes of the inputs to three_interpolate on stdout.
void three_interpolate_debug(at::Tensor points, at::Tensor idx,
                             at::Tensor weight) {
// Shared shape printer; output is byte-identical to the hand-rolled loops.
const auto print_shape = [](const char *label, const at::Tensor &t) {
std::cout << label << "(";
for (int64_t d = 0; d < t.dim(); ++d) {
std::cout << t.size(d) << " ";
}
std::cout << ")" << std::endl;
};
std::cout << "===== interpolate.cpp 调试信息 =====" << std::endl;
std::cout << "=========== three_interpolate ===========" << std::endl;
print_shape("points.size(): ", points);
print_shape("idx.size(): ", idx);
print_shape("weight.size(): ", weight);
std::cout << "====================================" << std::endl;
}
// Debug helper: dump the shapes of the inputs to three_interpolate_grad,
// plus the original point count m, on stdout.
void three_interpolate_grad_debug(at::Tensor grad_out, at::Tensor idx,
                                  at::Tensor weight, const int m) {
// Shared shape printer; output is byte-identical to the hand-rolled loops.
const auto print_shape = [](const char *label, const at::Tensor &t) {
std::cout << label << "(";
for (int64_t d = 0; d < t.dim(); ++d) {
std::cout << t.size(d) << " ";
}
std::cout << ")" << std::endl;
};
std::cout << "===== interpolate.cpp 调试信息 =====" << std::endl;
std::cout << "=========== three_interpolate_grad ===========" << std::endl;
print_shape("grad_out.size(): ", grad_out);
print_shape("idx.size(): ", idx);
print_shape("weight.size(): ", weight);
std::cout << "m: " << m << std::endl;
std::cout << "====================================" << std::endl;
}
#endif
// For each point in unknowns(b, n, 3), finds its 3 nearest neighbors among
// knows(b, m, 3). Returns {dist2, idx}: squared distances (b, n, 3) and the
// corresponding indices (b, n, 3), both sorted nearest-first.
std::vector<at::Tensor> three_nn(at::Tensor unknowns, at::Tensor knows) {
CHECK_CONTIGUOUS(unknowns);
CHECK_CONTIGUOUS(knows);
CHECK_IS_FLOAT(unknowns);
CHECK_IS_FLOAT(knows);
#ifdef WITH_CUDA
if (unknowns.is_cuda()) {
CHECK_CUDA(knows);
}
#endif
#ifdef DEBUG
three_nn_debug(unknowns, knows);
#endif
// Outputs allocated on the same device as the input.
at::Tensor idx =
torch::zeros({unknowns.size(0), unknowns.size(1), 3},
at::device(unknowns.device()).dtype(at::ScalarType::Int));
at::Tensor dist2 =
torch::zeros({unknowns.size(0), unknowns.size(1), 3},
at::device(unknowns.device()).dtype(at::ScalarType::Float));
#ifdef WITH_CUDA
if (unknowns.is_cuda()) {
three_nn_kernel_wrapper(unknowns.size(0), unknowns.size(1), knows.size(1),
unknowns.data_ptr<float>(), knows.data_ptr<float>(),
dist2.data_ptr<float>(), idx.data_ptr<int>());
} else {
#endif
// TORCH_CHECK(false, "CPU not supported");
// CPU fallback (same semantics as the CUDA kernel).
three_nn_cpu(unknowns.size(0), unknowns.size(1), knows.size(1),
unknowns.data_ptr<float>(), knows.data_ptr<float>(),
dist2.data_ptr<float>(), idx.data_ptr<int>());
#ifdef WITH_CUDA
}
#endif
return {dist2, idx};
}
// Weighted interpolation: for each of the idx(b, n, 3) neighbor triplets,
// blends features from points(b, c, m) with weight(b, n, 3), producing
// out(b, c, n).
at::Tensor three_interpolate(at::Tensor points, at::Tensor idx,
                             at::Tensor weight) {
CHECK_CONTIGUOUS(points);
CHECK_CONTIGUOUS(idx);
CHECK_CONTIGUOUS(weight);
CHECK_IS_FLOAT(points);
CHECK_IS_INT(idx);
CHECK_IS_FLOAT(weight);
#ifdef WITH_CUDA
if (points.is_cuda()) {
CHECK_CUDA(idx);
CHECK_CUDA(weight);
}
#endif
#ifdef DEBUG
three_interpolate_debug(points, idx, weight);
#endif
// Output allocated on the same device as the input.
at::Tensor output =
torch::zeros({points.size(0), points.size(1), idx.size(1)},
at::device(points.device()).dtype(at::ScalarType::Float));
#ifdef WITH_CUDA
if (points.is_cuda()) {
three_interpolate_kernel_wrapper(
points.size(0), points.size(1), points.size(2), idx.size(1),
points.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(),
output.data_ptr<float>());
} else {
#endif
// TORCH_CHECK(false, "CPU not supported");
// CPU fallback (same semantics as the CUDA kernel).
three_interpolate_cpu(
points.size(0), points.size(1), points.size(2), idx.size(1),
points.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(),
output.data_ptr<float>());
#ifdef WITH_CUDA
}
#endif
return output;
}
// Backward of three_interpolate: scatters grad_out(b, c, n) through
// idx/weight back into grad_points(b, c, m); `m` is the point count of the
// original (pre-interpolation) feature tensor.
at::Tensor three_interpolate_grad(at::Tensor grad_out, at::Tensor idx,
                                  at::Tensor weight, const int m) {
CHECK_CONTIGUOUS(grad_out);
CHECK_CONTIGUOUS(idx);
CHECK_CONTIGUOUS(weight);
CHECK_IS_FLOAT(grad_out);
CHECK_IS_INT(idx);
CHECK_IS_FLOAT(weight);
#ifdef WITH_CUDA
if (grad_out.is_cuda()) {
CHECK_CUDA(idx);
CHECK_CUDA(weight);
}
#endif
#ifdef DEBUG
three_interpolate_grad_debug(grad_out, idx, weight, m);
#endif
// Gradients accumulate into a zero-initialized buffer.
at::Tensor output =
torch::zeros({grad_out.size(0), grad_out.size(1), m},
at::device(grad_out.device()).dtype(at::ScalarType::Float));
#ifdef WITH_CUDA
if (grad_out.is_cuda()) {
three_interpolate_grad_kernel_wrapper(
grad_out.size(0), grad_out.size(1), grad_out.size(2), m,
grad_out.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(),
output.data_ptr<float>());
} else {
#endif
// TORCH_CHECK(false, "CPU not supported");
// CPU fallback (same semantics as the CUDA kernel).
three_interpolate_grad_cpu(
grad_out.size(0), grad_out.size(1), grad_out.size(2), m,
grad_out.data_ptr<float>(), idx.data_ptr<int>(), weight.data_ptr<float>(),
output.data_ptr<float>());
#ifdef WITH_CUDA
}
#endif
return output;
}

View File

@@ -0,0 +1,123 @@
// CPU reference implementation of three_nn.
//
// unknown : (b, n, 3) query points
// known   : (b, m, 3) reference points
// dist2   : (b, n, 3) output squared distances, nearest-first
// idx     : (b, n, 3) output indices of the 3 nearest reference points
void three_nn_cpu(
    int b, int n, int m,
    const float *unknown,
    const float *known,
    float *dist2,
    int *idx
) {
  for (int bi = 0; bi < b; ++bi) {
    // Per-batch base pointers.
    const float *q = unknown + bi * n * 3;
    const float *ref = known + bi * m * 3;
    float *d_out = dist2 + bi * n * 3;
    int *i_out = idx + bi * n * 3;
    for (int j = 0; j < n; ++j) {
      const float ux = q[j * 3 + 0];
      const float uy = q[j * 3 + 1];
      const float uz = q[j * 3 + 2];
      // Running top-3 (smallest squared distance first), seeded with a
      // sentinel larger than any real distance.
      double best[3] = {1e40, 1e40, 1e40};
      int besti[3] = {0, 0, 0};
      for (int k = 0; k < m; ++k) {
        const float dx = ux - ref[k * 3 + 0];
        const float dy = uy - ref[k * 3 + 1];
        const float dz = uz - ref[k * 3 + 2];
        const float d = dx * dx + dy * dy + dz * dz;
        // Insert d into the sorted top-3, shifting worse entries down.
        for (int slot = 0; slot < 3; ++slot) {
          if (d < best[slot]) {
            for (int s = 2; s > slot; --s) {
              best[s] = best[s - 1];
              besti[s] = besti[s - 1];
            }
            best[slot] = d;
            besti[slot] = k;
            break;
          }
        }
      }
      for (int s = 0; s < 3; ++s) {
        d_out[j * 3 + s] = best[s];
        i_out[j * 3 + s] = besti[s];
      }
    }
  }
}
// CPU reference implementation of three_interpolate.
//
// points : (b, c, m) source features
// idx    : (b, n, 3) neighbor indices into the m axis
// weight : (b, n, 3) interpolation weights
// out    : (b, c, n) blended output features
void three_interpolate_cpu(
    int b, int c, int m, int n,
    const float *points,
    const int *idx,
    const float *weight,
    float *out
) {
  for (int bi = 0; bi < b; ++bi) {
    // Per-batch base pointers.
    const float *src = points + bi * m * c;
    const int *nbr = idx + bi * n * 3;
    const float *w = weight + bi * n * 3;
    float *dst = out + bi * n * c;
    // Nested channel/point loops (equivalent to a flattened c*n loop).
    for (int l = 0; l < c; ++l) {
      for (int j = 0; j < n; ++j) {
        const int i1 = nbr[j * 3 + 0];
        const int i2 = nbr[j * 3 + 1];
        const int i3 = nbr[j * 3 + 2];
        dst[l * n + j] = w[j * 3 + 0] * src[l * m + i1] +
                         w[j * 3 + 1] * src[l * m + i2] +
                         w[j * 3 + 2] * src[l * m + i3];
      }
    }
  }
}
// CPU reference implementation of the three_interpolate backward pass.
//
// grad_out    : (b, c, n) incoming gradient
// idx, weight : (b, n, 3) neighbor indices and weights from the forward
// grad_points : (b, c, m) output gradient, zeroed here before accumulation
void three_interpolate_grad_cpu(
    int b, int c, int n, int m,
    const float *grad_out,
    const int *idx, const float *weight,
    float *grad_points
) {
  for (int bi = 0; bi < b; ++bi) {
    // Per-batch base pointers.
    const float *g_in = grad_out + bi * n * c;
    const int *nbr = idx + bi * n * 3;
    const float *w = weight + bi * n * 3;
    float *g_out = grad_points + bi * m * c;
    // Zero the accumulator so the function is self-contained.
    for (int i = 0; i < m * c; ++i) {
      g_out[i] = 0.0f;
    }
    // Nested channel/point loops (equivalent to a flattened c*n loop);
    // each gradient element fans out to its 3 source points.
    for (int l = 0; l < c; ++l) {
      for (int j = 0; j < n; ++j) {
        const float g = g_in[l * n + j];
        g_out[l * m + nbr[j * 3 + 0]] += w[j * 3 + 0] * g;
        g_out[l * m + nbr[j * 3 + 1]] += w[j * 3 + 1] * g;
        g_out[l * m + nbr[j * 3 + 2]] += w[j * 3 + 2] * g;
      }
    }
  }
}

View File

@@ -0,0 +1,159 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
// input: unknown(b, n, 3) known(b, m, 3)
// output: dist2(b, n, 3), idx(b, n, 3)
// One block per batch element; threads stride over the n query points.
// Each thread brute-force scans all m reference points, keeping a running
// top-3 of smallest squared distances via an if/else insertion chain.
__global__ void three_nn_kernel(int b, int n, int m,
                                const float *__restrict__ unknown,
                                const float *__restrict__ known,
                                float *__restrict__ dist2,
                                int *__restrict__ idx) {
int batch_index = blockIdx.x;
// Shift base pointers to this batch's slice.
unknown += batch_index * n * 3;
known += batch_index * m * 3;
dist2 += batch_index * n * 3;
idx += batch_index * n * 3;
int index = threadIdx.x;
int stride = blockDim.x;
for (int j = index; j < n; j += stride) {
float ux = unknown[j * 3 + 0];
float uy = unknown[j * 3 + 1];
float uz = unknown[j * 3 + 2];
// Sentinel distances larger than any real squared distance.
double best1 = 1e40, best2 = 1e40, best3 = 1e40;
int besti1 = 0, besti2 = 0, besti3 = 0;
for (int k = 0; k < m; ++k) {
float x = known[k * 3 + 0];
float y = known[k * 3 + 1];
float z = known[k * 3 + 2];
float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
// Insert d into the sorted (best1 <= best2 <= best3) triple.
if (d < best1) {
best3 = best2;
besti3 = besti2;
best2 = best1;
besti2 = besti1;
best1 = d;
besti1 = k;
} else if (d < best2) {
best3 = best2;
besti3 = besti2;
best2 = d;
besti2 = k;
} else if (d < best3) {
best3 = d;
besti3 = k;
}
}
dist2[j * 3 + 0] = best1;
dist2[j * 3 + 1] = best2;
dist2[j * 3 + 2] = best3;
idx[j * 3 + 0] = besti1;
idx[j * 3 + 1] = besti2;
idx[j * 3 + 2] = besti3;
}
}
// Host-side launcher: one block per batch, thread count chosen by
// opt_n_threads(n); runs on the current PyTorch CUDA stream.
void three_nn_kernel_wrapper(int b, int n, int m, const float *unknown,
                             const float *known, float *dist2, int *idx) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
three_nn_kernel<<<b, opt_n_threads(n), 0, stream>>>(b, n, m, unknown, known,
                                                    dist2, idx);
CUDA_CHECK_ERRORS();
}
// input: points(b, c, m), idx(b, n, 3), weight(b, n, 3)
// output: out(b, c, n)
// One block per batch element; the 2-D thread block is flattened and strided
// over the (c * n) output elements. Each output is the 3-way weighted blend
// of its neighbors' features.
__global__ void three_interpolate_kernel(int b, int c, int m, int n,
                                         const float *__restrict__ points,
                                         const int *__restrict__ idx,
                                         const float *__restrict__ weight,
                                         float *__restrict__ out) {
int batch_index = blockIdx.x;
// Shift base pointers to this batch's slice.
points += batch_index * m * c;
idx += batch_index * n * 3;
weight += batch_index * n * 3;
out += batch_index * n * c;
const int index = threadIdx.y * blockDim.x + threadIdx.x;
const int stride = blockDim.y * blockDim.x;
for (int i = index; i < c * n; i += stride) {
const int l = i / n;  // channel
const int j = i % n;  // output point
float w1 = weight[j * 3 + 0];
float w2 = weight[j * 3 + 1];
float w3 = weight[j * 3 + 2];
int i1 = idx[j * 3 + 0];
int i2 = idx[j * 3 + 1];
int i3 = idx[j * 3 + 2];
out[i] = points[l * m + i1] * w1 + points[l * m + i2] * w2 +
         points[l * m + i3] * w3;
}
}
// Host-side launcher: one block per batch, block shape chosen by
// opt_block_config from the (n, c) work size; runs on the current PyTorch
// CUDA stream.
void three_interpolate_kernel_wrapper(int b, int c, int m, int n,
                                      const float *points, const int *idx,
                                      const float *weight, float *out) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
three_interpolate_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
b, c, m, n, points, idx, weight, out);
CUDA_CHECK_ERRORS();
}
// input: grad_out(b, c, n), idx(b, n, 3), weight(b, n, 3)
// output: grad_points(b, c, m)
// Backward of three_interpolate: each gradient element fans out to its 3
// source points via atomicAdd (several outputs may share a source point).
__global__ void three_interpolate_grad_kernel(
    int b, int c, int n, int m, const float *__restrict__ grad_out,
    const int *__restrict__ idx, const float *__restrict__ weight,
    float *__restrict__ grad_points) {
int batch_index = blockIdx.x;
// Shift base pointers to this batch's slice.
grad_out += batch_index * n * c;
idx += batch_index * n * 3;
weight += batch_index * n * 3;
grad_points += batch_index * m * c;
const int index = threadIdx.y * blockDim.x + threadIdx.x;
const int stride = blockDim.y * blockDim.x;
for (int i = index; i < c * n; i += stride) {
const int l = i / n;  // channel
const int j = i % n;  // gradient point
float w1 = weight[j * 3 + 0];
float w2 = weight[j * 3 + 1];
float w3 = weight[j * 3 + 2];
int i1 = idx[j * 3 + 0];
int i2 = idx[j * 3 + 1];
int i3 = idx[j * 3 + 2];
atomicAdd(grad_points + l * m + i1, grad_out[i] * w1);
atomicAdd(grad_points + l * m + i2, grad_out[i] * w2);
atomicAdd(grad_points + l * m + i3, grad_out[i] * w3);
}
}
// Host-side launcher for the interpolation backward kernel; same launch
// configuration as the forward wrapper, on the current PyTorch CUDA stream.
void three_interpolate_grad_kernel_wrapper(int b, int c, int n, int m,
                                           const float *grad_out,
                                           const int *idx, const float *weight,
                                           float *grad_points) {
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
three_interpolate_grad_kernel<<<b, opt_block_config(n, c), 0, stream>>>(
b, c, n, m, grad_out, idx, weight, grad_points);
CUDA_CHECK_ERRORS();
}

View File

@@ -0,0 +1,207 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include "sampling.h"
#include "utils.h"
#ifdef WITH_CUDA
void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
const float *points, const int *idx,
float *out);
void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
const float *grad_out, const int *idx,
float *grad_points);
void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
const float *dataset, float *temp,
int *idxs);
#endif
void gather_points_cpu(int b, int c, int n, int npoints,
const float *points, const int *idx,
float *out);
void gather_points_grad_cpu(int b, int c, int n, int npoints,
const float *grad_out, const int *idx,
float *grad_points);
void furthest_point_sampling_cpu(int b, int n, int m,
const float *dataset, float *temp,
int *idxs);
#ifdef DEBUG
// Debug helper: dump the shapes of the inputs to gather_points on stdout.
void gather_points_debug(at::Tensor points, at::Tensor idx) {
// Shared shape printer; output is byte-identical to the hand-rolled loops.
const auto print_shape = [](const char *label, const at::Tensor &t) {
std::cout << label << "(";
for (int64_t d = 0; d < t.dim(); ++d) {
std::cout << t.size(d) << " ";
}
std::cout << ")" << std::endl;
};
std::cout << "===== sampling.cpp 调试信息 =====" << std::endl;
std::cout << "=========== gather_points ===========" << std::endl;
print_shape("points.size(): ", points);
print_shape("idx.size(): ", idx);
std::cout << "====================================" << std::endl;
}
// Debug helper: dump the shapes of the inputs to gather_points_grad, plus
// the original point count n, on stdout.
void gather_points_grad_debug(at::Tensor grad_out, at::Tensor idx,
                              const int n) {
// Shared shape printer; output is byte-identical to the hand-rolled loops.
const auto print_shape = [](const char *label, const at::Tensor &t) {
std::cout << label << "(";
for (int64_t d = 0; d < t.dim(); ++d) {
std::cout << t.size(d) << " ";
}
std::cout << ")" << std::endl;
};
std::cout << "===== sampling.cpp 调试信息 =====" << std::endl;
std::cout << "=========== gather_points_grad ===========" << std::endl;
print_shape("grad_out.size(): ", grad_out);
print_shape("idx.size(): ", idx);
std::cout << "n: " << n << std::endl;
std::cout << "====================================" << std::endl;
}
// Debug helper: dump the input shape and sample count for
// furthest_point_sampling on stdout.
void furthest_point_sampling_debug(at::Tensor points, const int nsamples) {
std::cout << "===== sampling.cpp 调试信息 =====" << std::endl;
std::cout << "=========== furthest_point_sampling ===========" << std::endl;
std::cout << "points.size(): (";
// Print each dimension separated by a space, matching the other helpers.
for (int64_t d = 0; d < points.dim(); ++d) {
std::cout << points.size(d) << " ";
}
std::cout << ")" << std::endl;
std::cout << "nsamples: " << nsamples << std::endl;
std::cout << "====================================" << std::endl;
}
#endif
// Gathers feature columns: out(b, c, npoints) = points(b, c, n) indexed by
// idx(b, npoints) along the last axis. Dispatches to CUDA when the input is
// on the GPU (WITH_CUDA builds), otherwise uses the CPU fallback.
at::Tensor gather_points(at::Tensor points, at::Tensor idx) {
CHECK_CONTIGUOUS(points);
CHECK_CONTIGUOUS(idx);
CHECK_IS_FLOAT(points);
CHECK_IS_INT(idx);
#ifdef WITH_CUDA
if (points.is_cuda()) {
CHECK_CUDA(idx);
}
#endif
#ifdef DEBUG
gather_points_debug(points, idx);
#endif
// Output allocated on the same device as the input.
at::Tensor output =
torch::zeros({points.size(0), points.size(1), idx.size(1)},
at::device(points.device()).dtype(at::ScalarType::Float));
#ifdef WITH_CUDA
if (points.is_cuda()) {
gather_points_kernel_wrapper(points.size(0), points.size(1), points.size(2),
idx.size(1), points.data_ptr<float>(),
idx.data_ptr<int>(), output.data_ptr<float>());
} else {
#endif
// TORCH_CHECK(false, "CPU not supported");
// CPU fallback (same semantics as the CUDA kernel).
gather_points_cpu(points.size(0), points.size(1), points.size(2),
idx.size(1), points.data_ptr<float>(),
idx.data_ptr<int>(), output.data_ptr<float>());
#ifdef WITH_CUDA
}
#endif
return output;
}
// Backward of gather_points: scatters grad_out(b, c, npoints) back through
// idx(b, npoints) into grad_points(b, c, n); `n` is the point count of the
// original (pre-gather) feature tensor.
at::Tensor gather_points_grad(at::Tensor grad_out, at::Tensor idx,
                              const int n) {
CHECK_CONTIGUOUS(grad_out);
CHECK_CONTIGUOUS(idx);
CHECK_IS_FLOAT(grad_out);
CHECK_IS_INT(idx);
#ifdef WITH_CUDA
if (grad_out.is_cuda()) {
CHECK_CUDA(idx);
}
#endif
#ifdef DEBUG
gather_points_grad_debug(grad_out, idx, n);
#endif
// Gradients accumulate into a zero-initialized buffer.
at::Tensor output =
torch::zeros({grad_out.size(0), grad_out.size(1), n},
at::device(grad_out.device()).dtype(at::ScalarType::Float));
#ifdef WITH_CUDA
if (grad_out.is_cuda()) {
gather_points_grad_kernel_wrapper(grad_out.size(0), grad_out.size(1), n,
idx.size(1), grad_out.data_ptr<float>(),
idx.data_ptr<int>(), output.data_ptr<float>());
} else {
#endif
// TORCH_CHECK(false, "CPU not supported");
// CPU fallback (same semantics as the CUDA kernel).
gather_points_grad_cpu(grad_out.size(0), grad_out.size(1), n,
idx.size(1), grad_out.data_ptr<float>(),
idx.data_ptr<int>(), output.data_ptr<float>());
#ifdef WITH_CUDA
}
#endif
return output;
}
// Iterative farthest point sampling over points(b, n, 3); returns the
// indices (b, nsamples) of the selected subset. `tmp` is scratch space
// holding each point's squared distance to the selected set, seeded with a
// large sentinel (1e10).
at::Tensor furthest_point_sampling(at::Tensor points, const int nsamples) {
CHECK_CONTIGUOUS(points);
CHECK_IS_FLOAT(points);
#ifdef DEBUG
furthest_point_sampling_debug(points, nsamples);
#endif
at::Tensor output =
torch::zeros({points.size(0), nsamples},
at::device(points.device()).dtype(at::ScalarType::Int));
// Per-point min-distance scratch buffer, initialized to the sentinel.
at::Tensor tmp =
torch::full({points.size(0), points.size(1)}, 1e10,
at::device(points.device()).dtype(at::ScalarType::Float));
#ifdef WITH_CUDA
if (points.is_cuda()) {
furthest_point_sampling_kernel_wrapper(
points.size(0), points.size(1), nsamples, points.data_ptr<float>(),
tmp.data_ptr<float>(), output.data_ptr<int>());
} else {
#endif
// TORCH_CHECK(false, "CPU not supported");
// CPU fallback (same semantics as the CUDA kernel).
furthest_point_sampling_cpu(
points.size(0), points.size(1), nsamples, points.data_ptr<float>(),
tmp.data_ptr<float>(), output.data_ptr<int>());
#ifdef WITH_CUDA
}
#endif
return output;
}

View File

@@ -0,0 +1,98 @@
// CPU reference implementation of gather_points:
// out[i][l][j] = points[i][l][idx[i][j]].
void gather_points_cpu(
    int b, int c, int n, int npoints,
    const float *points,
    const int *idx,
    float *out
) {
  for (int bi = 0; bi < b; ++bi) {
    // Per-batch base pointers.
    const float *points_b = points + bi * c * n;
    const int *idx_b = idx + bi * npoints;
    float *out_b = out + bi * c * npoints;
    for (int l = 0; l < c; ++l) {
      for (int j = 0; j < npoints; ++j) {
        out_b[l * npoints + j] = points_b[l * n + idx_b[j]];
      }
    }
  }
}
// CPU reference implementation of the gather_points backward pass:
// accumulates grad_out[i][l][j] into grad_points[i][l][idx[i][j]].
// grad_points must arrive zero-initialized (the caller allocates it with
// torch::zeros), matching the CUDA atomicAdd semantics.
void gather_points_grad_cpu(
    int b, int c, int n, int npoints,
    const float *grad_out,
    const int *idx,
    float *grad_points
) {
  for (int bi = 0; bi < b; ++bi) {
    // Per-batch base pointers.
    const float *g_in = grad_out + bi * c * npoints;
    const int *idx_b = idx + bi * npoints;
    float *g_out = grad_points + bi * c * n;
    for (int l = 0; l < c; ++l) {
      for (int j = 0; j < npoints; ++j) {
        g_out[l * n + idx_b[j]] += g_in[l * npoints + j];
      }
    }
  }
}
// CPU reference implementation of iterative farthest point sampling.
//
// dataset : (b, n, 3) xyz coordinates
// temp    : (b, n) scratch buffer holding, per point, the squared distance
//           to the closest already-selected point
// idxs    : (b, m) output indices of the selected points
//
// Mirrors the CUDA kernel: the first sample is always index 0, and points
// whose squared norm is <= 1e-3 (near-origin padding/invalid points) are
// never selected.
//
// BUGFIX vs the previous version: the arg-max used to run as a second pass
// over the whole temp buffer, so invalid points — whose temp entry is never
// written and stays at 1e10 — were picked as the "farthest" point. The
// arg-max is now folded into the distance loop (as in the CUDA kernel), so
// only valid points can be selected.
void furthest_point_sampling_cpu(
    int b, int n, int m,
    const float *dataset,
    float *temp,
    int *idxs
) {
  // Guard matches the CUDA kernel's `if (m <= 0) return;`.
  if (m <= 0) return;
  for (int batch_index = 0; batch_index < b; ++batch_index) {
    const float *dataset_batch = dataset + batch_index * n * 3;
    float *temp_batch = temp + batch_index * n;
    int *idxs_batch = idxs + batch_index * m;
    // Seed: the first selected point is index 0, as in the CUDA kernel.
    int old = 0;
    idxs_batch[0] = old;
    // Reset min-distances so the first pass is a plain distance computation
    // (keeps the function self-contained regardless of caller-provided temp).
    for (int k = 0; k < n; ++k) {
      temp_batch[k] = 1e10f;
    }
    for (int j = 1; j < m; ++j) {
      // Coordinates of the most recently selected point.
      float x1 = dataset_batch[old * 3 + 0];
      float y1 = dataset_batch[old * 3 + 1];
      float z1 = dataset_batch[old * 3 + 2];
      // Track the farthest valid point while updating the min-distances.
      float best = -1.0f;
      int besti = 0;
      for (int k = 0; k < n; ++k) {
        float x2 = dataset_batch[k * 3 + 0];
        float y2 = dataset_batch[k * 3 + 1];
        float z2 = dataset_batch[k * 3 + 2];
        // Skip padding points clustered at the origin (same threshold as
        // the CUDA kernel).
        float mag = x2 * x2 + y2 * y2 + z2 * z2;
        if (mag <= 1e-3f) continue;
        float dx = x2 - x1;
        float dy = y2 - y1;
        float dz = z2 - z1;
        float d = dx * dx + dy * dy + dz * dz;
        // Squared distance to the closest already-selected point.
        float d2 = d < temp_batch[k] ? d : temp_batch[k];
        temp_batch[k] = d2;
        if (d2 > best) {
          best = d2;
          besti = k;
        }
      }
      // The farthest remaining valid point becomes the next sample.
      old = besti;
      idxs_batch[j] = old;
    }
  }
}

View File

@@ -0,0 +1,234 @@
// Copyright (c) Facebook, Inc. and its affiliates.
//
// This source code is licensed under the MIT license found in the
// LICENSE file in the root directory of this source tree.
#include <stdio.h>
#include <stdlib.h>
#include "cuda_utils.h"
// input: points(b, c, n) idx(b, m)
// output: out(b, c, m)
__global__ void gather_points_kernel(int b, int c, int n, int m,
const float *__restrict__ points,
const int *__restrict__ idx,
float *__restrict__ out) {
for (int i = blockIdx.x; i < b; i += gridDim.x) {
for (int l = blockIdx.y; l < c; l += gridDim.y) {
for (int j = threadIdx.x; j < m; j += blockDim.x) {
int a = idx[i * m + j];
out[(i * c + l) * m + j] = points[(i * c + l) * n + a];
}
}
}
}
// Host-side launcher: a (b, c) grid with opt_n_threads(npoints) threads per
// block, on the current PyTorch CUDA stream.
void gather_points_kernel_wrapper(int b, int c, int n, int npoints,
                                  const float *points, const int *idx,
                                  float *out) {
gather_points_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
                       at::cuda::getCurrentCUDAStream()>>>(b, c, n, npoints,
                                                           points, idx, out);
CUDA_CHECK_ERRORS();
}
// input: grad_out(b, c, m) idx(b, m)
// output: grad_points(b, c, n)
// Backward of gather_points: same grid/thread layout as the forward kernel,
// scattering gradients with atomicAdd (idx may contain duplicates).
__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
                                          const float *__restrict__ grad_out,
                                          const int *__restrict__ idx,
                                          float *__restrict__ grad_points) {
for (int i = blockIdx.x; i < b; i += gridDim.x) {
for (int l = blockIdx.y; l < c; l += gridDim.y) {
for (int j = threadIdx.x; j < m; j += blockDim.x) {
int a = idx[i * m + j];
atomicAdd(grad_points + (i * c + l) * n + a,
          grad_out[(i * c + l) * m + j]);
}
}
}
}
// Host-side launcher for the gather backward kernel; same launch
// configuration as the forward wrapper, on the current PyTorch CUDA stream.
void gather_points_grad_kernel_wrapper(int b, int c, int n, int npoints,
                                       const float *grad_out, const int *idx,
                                       float *grad_points) {
gather_points_grad_kernel<<<dim3(b, c, 1), opt_n_threads(npoints), 0,
                            at::cuda::getCurrentCUDAStream()>>>(
b, c, n, npoints, grad_out, idx, grad_points);
CUDA_CHECK_ERRORS();
}
// Reduction step: folds slot idx2 into slot idx1, keeping the larger
// distance and its point index (used by the FPS shared-memory arg-max).
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
                         int idx1, int idx2) {
const float v1 = dists[idx1], v2 = dists[idx2];
const int i1 = dists_i[idx1], i2 = dists_i[idx2];
dists[idx1] = max(v1, v2);
dists_i[idx1] = v2 > v1 ? i2 : i1;
}
// Input dataset: (b, n, 3), tmp: (b, n)
// Ouput idxs (b, m)
// Iterative farthest point sampling: one block per batch. Each iteration,
// threads stride over the n points updating temp[k] (squared distance to the
// closest already-selected point) and tracking a per-thread arg-max, then a
// shared-memory tree reduction (via __update) finds the global farthest
// point, which becomes the next sample. Points with squared norm <= 1e-3
// never update best/besti and so are never selected.
template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(
    int b, int n, int m, const float *__restrict__ dataset,
    float *__restrict__ temp, int *__restrict__ idxs) {
if (m <= 0) return;
// Shared scratch for the block-wide arg-max reduction.
__shared__ float dists[block_size];
__shared__ int dists_i[block_size];
int batch_index = blockIdx.x;
dataset += batch_index * n * 3;
temp += batch_index * n;
idxs += batch_index * m;
int tid = threadIdx.x;
const int stride = block_size;
// The first selected point is always index 0.
int old = 0;
if (threadIdx.x == 0) idxs[0] = old;
__syncthreads();
for (int j = 1; j < m; j++) {
int besti = 0;
float best = -1;
// Coordinates of the most recently selected point.
float x1 = dataset[old * 3 + 0];
float y1 = dataset[old * 3 + 1];
float z1 = dataset[old * 3 + 2];
for (int k = tid; k < n; k += stride) {
float x2, y2, z2;
x2 = dataset[k * 3 + 0];
y2 = dataset[k * 3 + 1];
z2 = dataset[k * 3 + 2];
// Skip near-origin (padding/invalid) points.
float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
if (mag <= 1e-3) continue;
float d =
    (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
// Min-distance to the selected set so far.
float d2 = min(d, temp[k]);
temp[k] = d2;
besti = d2 > best ? k : besti;
best = d2 > best ? d2 : best;
}
dists[tid] = best;
dists_i[tid] = besti;
__syncthreads();
// Unrolled binary tree reduction over the per-thread maxima; the
// block_size guards are resolved at compile time via the template.
if (block_size >= 512) {
if (tid < 256) {
__update(dists, dists_i, tid, tid + 256);
}
__syncthreads();
}
if (block_size >= 256) {
if (tid < 128) {
__update(dists, dists_i, tid, tid + 128);
}
__syncthreads();
}
if (block_size >= 128) {
if (tid < 64) {
__update(dists, dists_i, tid, tid + 64);
}
__syncthreads();
}
if (block_size >= 64) {
if (tid < 32) {
__update(dists, dists_i, tid, tid + 32);
}
__syncthreads();
}
if (block_size >= 32) {
if (tid < 16) {
__update(dists, dists_i, tid, tid + 16);
}
__syncthreads();
}
if (block_size >= 16) {
if (tid < 8) {
__update(dists, dists_i, tid, tid + 8);
}
__syncthreads();
}
if (block_size >= 8) {
if (tid < 4) {
__update(dists, dists_i, tid, tid + 4);
}
__syncthreads();
}
if (block_size >= 4) {
if (tid < 2) {
__update(dists, dists_i, tid, tid + 2);
}
__syncthreads();
}
if (block_size >= 2) {
if (tid < 1) {
__update(dists, dists_i, tid, tid + 1);
}
__syncthreads();
}
// Slot 0 now holds the global farthest point for this iteration.
old = dists_i[0];
if (tid == 0) idxs[j] = old;
}
}
// Host-side launcher: instantiates the FPS kernel template for the
// power-of-two thread count chosen by opt_n_threads(n), one block per batch,
// on the current PyTorch CUDA stream.
// NOTE(review): the default case assumes opt_n_threads never exceeds 512 —
// confirm against cuda_utils.h.
void furthest_point_sampling_kernel_wrapper(int b, int n, int m,
                                            const float *dataset, float *temp,
                                            int *idxs) {
unsigned int n_threads = opt_n_threads(n);
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
switch (n_threads) {
case 512:
furthest_point_sampling_kernel<512>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 256:
furthest_point_sampling_kernel<256>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 128:
furthest_point_sampling_kernel<128>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 64:
furthest_point_sampling_kernel<64>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 32:
furthest_point_sampling_kernel<32>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 16:
furthest_point_sampling_kernel<16>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 8:
furthest_point_sampling_kernel<8>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 4:
furthest_point_sampling_kernel<4>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 2:
furthest_point_sampling_kernel<2>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 1:
furthest_point_sampling_kernel<1>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
default:
furthest_point_sampling_kernel<512>
    <<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
}
CUDA_CHECK_ERRORS();
}

View File

@@ -0,0 +1 @@
from .pointnet2 import *

View File

@@ -0,0 +1,518 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
''' Pointnet2 layers.
Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch
Extended with the following:
1. Uniform sampling in each local region (sample_uniformly)
2. Return sampled points indices to support votenet.
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
# import os
# import sys
# BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# sys.path.append(BASE_DIR)
from . import pointnet2_utils
from . import pytorch_utils as pt_utils
from typing import List
class _PointnetSAModuleBase(nn.Module):
    """Shared skeleton of the PointNet++ set-abstraction modules.

    Subclasses fill in ``npoint``, ``groupers`` and ``mlps`` in their
    constructors; this base class provides only the common forward pass.
    """

    def __init__(self):
        super().__init__()
        self.npoint = None
        self.groupers = None
        self.mlps = None

    def forward(self, xyz: torch.Tensor,
                features: torch.Tensor = None) -> (torch.Tensor, torch.Tensor):
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            (B, N, 3) tensor of the xyz coordinates of the features
        features : torch.Tensor
            (B, N, C) tensor of the descriptors of the features

        Returns
        -------
        new_xyz : torch.Tensor
            (B, npoint, 3) tensor of the new features' xyz
        new_features : torch.Tensor
            (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors
        """
        xyz_flipped = xyz.transpose(1, 2).contiguous()
        if self.npoint is None:
            new_xyz = None
        else:
            # Furthest-point-sample the centroids, then gather their coordinates.
            sampled = pointnet2_utils.furthest_point_sample(xyz, self.npoint)
            new_xyz = pointnet2_utils.gather_operation(
                xyz_flipped, sampled
            ).transpose(1, 2).contiguous()

        pooled = []
        for grouper, mlp in zip(self.groupers, self.mlps):
            scale_feats = grouper(xyz, new_xyz, features)  # (B, C, npoint, nsample)
            scale_feats = mlp(scale_feats)                 # (B, mlp[-1], npoint, nsample)
            # Max-pool over each local neighbourhood.
            scale_feats = F.max_pool2d(
                scale_feats, kernel_size=[1, scale_feats.size(3)]
            )                                              # (B, mlp[-1], npoint, 1)
            pooled.append(scale_feats.squeeze(-1))         # (B, mlp[-1], npoint)

        return new_xyz, torch.cat(pooled, dim=1)
class PointnetSAModuleMSG(_PointnetSAModuleBase):
    r"""Pointnet set abstraction layer with multiscale grouping

    Parameters
    ----------
    npoint : int
        Number of features
    radii : list of float32
        list of radii to group with
    nsamples : list of int32
        Number of samples in each ball query
    mlps : list of list of int32
        Spec of the pointnet before the global max_pool for each scale
    bn : bool
        Use batchnorm
    """

    def __init__(
            self,
            *,
            npoint: int,
            radii: List[float],
            nsamples: List[int],
            mlps: List[List[int]],
            bn: bool = True,
            use_xyz: bool = True,
            sample_uniformly: bool = False
    ):
        super().__init__()

        assert len(radii) == len(nsamples) == len(mlps)

        self.npoint = npoint
        self.groupers = nn.ModuleList()
        self.mlps = nn.ModuleList()
        for i in range(len(radii)):
            radius = radii[i]
            nsample = nsamples[i]
            self.groupers.append(
                pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz,
                                              sample_uniformly=sample_uniformly)
                if npoint is not None else pointnet2_utils.GroupAll(use_xyz)
            )
            # Bug fix: copy the spec before widening the first channel count.
            # The original aliased the caller's nested list (mlp_spec = mlps[i])
            # and mutated it in place, so constructing two modules from the
            # same ``mlps`` value added 3 twice.
            mlp_spec = list(mlps[i])
            if use_xyz:
                mlp_spec[0] += 3  # xyz offsets are concatenated to the features

            self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn))
class PointnetSAModule(PointnetSAModuleMSG):
    r"""Single-scale Pointnet set abstraction layer.

    Parameters
    ----------
    npoint : int
        Number of features
    radius : float
        Radius of ball
    nsample : int
        Number of samples in the ball query
    mlp : list
        Spec of the pointnet before the global max_pool
    bn : bool
        Use batchnorm
    """

    def __init__(
            self,
            *,
            mlp: List[int],
            npoint: int = None,
            radius: float = None,
            nsample: int = None,
            bn: bool = True,
            use_xyz: bool = True
    ):
        # A single-scale module is just the multi-scale one with one entry
        # in every per-scale list.
        super().__init__(
            npoint=npoint,
            radii=[radius],
            nsamples=[nsample],
            mlps=[mlp],
            bn=bn,
            use_xyz=use_xyz
        )
class PointnetSAModuleVotes(nn.Module):
    ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG
    with extra support for returning point indices for getting their GT votes '''

    def __init__(
            self,
            *,
            mlp: List[int],
            npoint: int = None,
            radius: float = None,
            nsample: int = None,
            bn: bool = True,
            use_xyz: bool = True,
            pooling: str = 'max',
            sigma: float = None,  # for RBF pooling
            normalize_xyz: bool = False,  # normalize local XYZ with radius
            sample_uniformly: bool = False,
            ret_unique_cnt: bool = False
    ):
        super().__init__()

        self.npoint = npoint
        self.radius = radius
        self.nsample = nsample
        self.pooling = pooling
        self.mlp_module = None
        self.use_xyz = use_xyz
        self.sigma = sigma
        # Default RBF bandwidth is half the grouping radius.  Bug fix: the
        # original computed ``self.radius / 2`` unconditionally, which raised
        # TypeError when radius is None (the GroupAll, npoint=None case).
        if self.sigma is None and self.radius is not None:
            self.sigma = self.radius / 2
        self.normalize_xyz = normalize_xyz
        self.ret_unique_cnt = ret_unique_cnt

        if npoint is not None:
            self.grouper = pointnet2_utils.QueryAndGroup(
                radius, nsample,
                use_xyz=use_xyz, ret_grouped_xyz=True, normalize_xyz=normalize_xyz,
                sample_uniformly=sample_uniformly, ret_unique_cnt=ret_unique_cnt)
        else:
            self.grouper = pointnet2_utils.GroupAll(use_xyz, ret_grouped_xyz=True)

        # Bug fix: copy the spec so the caller's list is not mutated when the
        # first channel count is widened for the concatenated xyz offsets.
        mlp_spec = list(mlp)
        if use_xyz and len(mlp_spec) > 0:
            mlp_spec[0] += 3
        self.mlp_module = pt_utils.SharedMLP(mlp_spec, bn=bn)

    def forward(self, xyz: torch.Tensor,
                features: torch.Tensor = None,
                inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor):
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            (B, N, 3) tensor of the xyz coordinates of the features
        features : torch.Tensor
            (B, C, N) tensor of the descriptors of the features
        inds : torch.Tensor
            (B, npoint) tensor that stores index to the xyz points (values in 0-N-1)

        Returns
        -------
        new_xyz : torch.Tensor
            (B, npoint, 3) tensor of the new features' xyz
        new_features : torch.Tensor
            (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors
        inds : torch.Tensor
            (B, npoint) tensor of the inds
        """
        xyz_flipped = xyz.transpose(1, 2).contiguous()
        if inds is None:
            inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint)
        else:
            assert (inds.shape[1] == self.npoint)
        new_xyz = pointnet2_utils.gather_operation(
            xyz_flipped, inds
        ).transpose(1, 2).contiguous() if self.npoint is not None else None

        if not self.ret_unique_cnt:
            grouped_features, grouped_xyz = self.grouper(
                xyz, new_xyz, features
            )  # (B, C, npoint, nsample)
        else:
            grouped_features, grouped_xyz, unique_cnt = self.grouper(
                xyz, new_xyz, features
            )  # (B, C, npoint, nsample), (B, 3, npoint, nsample), (B, npoint)

        new_features = self.mlp_module(
            grouped_features
        )  # (B, mlp[-1], npoint, nsample)

        if self.pooling == 'max':
            new_features = F.max_pool2d(
                new_features, kernel_size=[1, new_features.size(3)]
            )  # (B, mlp[-1], npoint, 1)
        elif self.pooling == 'avg':
            new_features = F.avg_pool2d(
                new_features, kernel_size=[1, new_features.size(3)]
            )  # (B, mlp[-1], npoint, 1)
        elif self.pooling == 'rbf':
            # Radial-basis-function weighted sum of features, normalized by
            # nsample and sigma.
            # Ref: https://en.wikipedia.org/wiki/Radial_basis_function_kernel
            rbf = torch.exp(-1 * grouped_xyz.pow(2).sum(1, keepdim=False) / (self.sigma ** 2) / 2)  # (B, npoint, nsample)
            new_features = torch.sum(new_features * rbf.unsqueeze(1), -1, keepdim=True) / float(self.nsample)  # (B, mlp[-1], npoint, 1)
        new_features = new_features.squeeze(-1)  # (B, mlp[-1], npoint)

        if not self.ret_unique_cnt:
            return new_xyz, new_features, inds
        else:
            return new_xyz, new_features, inds, unique_cnt
class PointnetSAModuleMSGVotes(nn.Module):
    ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG
    with extra support for returning point indices for getting their GT votes '''

    def __init__(
            self,
            *,
            mlps: List[List[int]],
            npoint: int,
            radii: List[float],
            nsamples: List[int],
            bn: bool = True,
            use_xyz: bool = True,
            sample_uniformly: bool = False
    ):
        super().__init__()

        assert (len(mlps) == len(nsamples) == len(radii))

        self.npoint = npoint
        self.groupers = nn.ModuleList()
        self.mlps = nn.ModuleList()
        for i in range(len(radii)):
            radius = radii[i]
            nsample = nsamples[i]
            self.groupers.append(
                pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz,
                                              sample_uniformly=sample_uniformly)
                if npoint is not None else pointnet2_utils.GroupAll(use_xyz)
            )
            # Bug fix: copy the spec before widening the first channel count;
            # the original aliased the caller's nested list and mutated it.
            mlp_spec = list(mlps[i])
            if use_xyz:
                mlp_spec[0] += 3
            self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn))

    def forward(self, xyz: torch.Tensor,
                features: torch.Tensor = None, inds: torch.Tensor = None) -> (torch.Tensor, torch.Tensor):
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            (B, N, 3) tensor of the xyz coordinates of the features
        features : torch.Tensor
            (B, C, N) tensor of the descriptors of the features
        inds : torch.Tensor
            (B, npoint) tensor that stores index to the xyz points (values in 0-N-1)

        Returns
        -------
        new_xyz : torch.Tensor
            (B, npoint, 3) tensor of the new features' xyz
        new_features : torch.Tensor
            (B, \sum_k(mlps[k][-1]), npoint) tensor of the new_features descriptors
        inds : torch.Tensor
            (B, npoint) tensor of the inds
        """
        new_features_list = []

        xyz_flipped = xyz.transpose(1, 2).contiguous()
        if inds is None:
            inds = pointnet2_utils.furthest_point_sample(xyz, self.npoint)
        new_xyz = pointnet2_utils.gather_operation(
            xyz_flipped, inds
        ).transpose(1, 2).contiguous() if self.npoint is not None else None

        for i in range(len(self.groupers)):
            new_features = self.groupers[i](
                xyz, new_xyz, features
            )  # (B, C, npoint, nsample)
            new_features = self.mlps[i](
                new_features
            )  # (B, mlp[-1], npoint, nsample)
            new_features = F.max_pool2d(
                new_features, kernel_size=[1, new_features.size(3)]
            )  # (B, mlp[-1], npoint, 1)
            new_features = new_features.squeeze(-1)  # (B, mlp[-1], npoint)
            new_features_list.append(new_features)

        return new_xyz, torch.cat(new_features_list, dim=1), inds
class PointnetFPModule(nn.Module):
    r"""Propagates the features of one set to another.

    Parameters
    ----------
    mlp : list
        Pointnet module parameters
    bn : bool
        Use batchnorm
    """

    def __init__(self, *, mlp: List[int], bn: bool = True):
        super().__init__()
        self.mlp = pt_utils.SharedMLP(mlp, bn=bn)

    def forward(
            self, unknown: torch.Tensor, known: torch.Tensor,
            unknow_feats: torch.Tensor, known_feats: torch.Tensor
    ) -> torch.Tensor:
        r"""
        Parameters
        ----------
        unknown : torch.Tensor
            (B, n, 3) tensor of the xyz positions of the unknown features
        known : torch.Tensor
            (B, m, 3) tensor of the xyz positions of the known features
        unknow_feats : torch.Tensor
            (B, C1, n) tensor of the features to be propagated to
        known_feats : torch.Tensor
            (B, C2, m) tensor of features to be propagated

        Returns
        -------
        new_features : torch.Tensor
            (B, mlp[-1], n) tensor of the features of the unknown features
        """
        if known is None:
            # No source coordinates: broadcast the features to all n targets.
            interpolated_feats = known_feats.expand(
                *known_feats.size()[0:2], unknown.size(1)
            )
        else:
            # Inverse-distance weighting over the three nearest known points.
            dist, idx = pointnet2_utils.three_nn(unknown, known)
            dist_recip = 1.0 / (dist + 1e-8)
            weight = dist_recip / torch.sum(dist_recip, dim=2, keepdim=True)
            interpolated_feats = pointnet2_utils.three_interpolate(
                known_feats, idx, weight
            )

        if unknow_feats is None:
            new_features = interpolated_feats
        else:
            # (B, C2 + C1, n)
            new_features = torch.cat([interpolated_feats, unknow_feats], dim=1)

        # SharedMLP consumes 4-D input, so add and strip a trailing axis.
        return self.mlp(new_features.unsqueeze(-1)).squeeze(-1)
class PointnetLFPModuleMSG(nn.Module):
    ''' Modified based on _PointnetSAModuleBase and PointnetSAModuleMSG
    learnable feature propagation layer.'''

    def __init__(
            self,
            *,
            mlps: List[List[int]],
            radii: List[float],
            nsamples: List[int],
            post_mlp: List[int],
            bn: bool = True,
            use_xyz: bool = True,
            sample_uniformly: bool = False
    ):
        super().__init__()

        assert (len(mlps) == len(nsamples) == len(radii))

        self.post_mlp = pt_utils.SharedMLP(post_mlp, bn=bn)

        self.groupers = nn.ModuleList()
        self.mlps = nn.ModuleList()
        for i in range(len(radii)):
            radius = radii[i]
            nsample = nsamples[i]
            self.groupers.append(
                pointnet2_utils.QueryAndGroup(radius, nsample, use_xyz=use_xyz,
                                              sample_uniformly=sample_uniformly)
            )
            # Bug fix: copy the spec before widening the first channel count;
            # the original mutated the caller's nested list in place.
            mlp_spec = list(mlps[i])
            if use_xyz:
                mlp_spec[0] += 3
            self.mlps.append(pt_utils.SharedMLP(mlp_spec, bn=bn))

    def forward(self, xyz2: torch.Tensor, xyz1: torch.Tensor,
                features2: torch.Tensor, features1: torch.Tensor) -> torch.Tensor:
        r""" Propagate features from xyz1 to xyz2.

        Parameters
        ----------
        xyz2 : torch.Tensor
            (B, N2, 3) tensor of the xyz coordinates of the features
        xyz1 : torch.Tensor
            (B, N1, 3) tensor of the xyz coordinates of the features
        features2 : torch.Tensor
            (B, C2, N2) tensor of the descriptors of the features
        features1 : torch.Tensor
            (B, C1, N1) tensor of the descriptors of the features

        Returns
        -------
        new_features1 : torch.Tensor
            (B, \sum_k(mlps[k][-1]), N1) tensor of the new_features descriptors
        """
        new_features_list = []

        for i in range(len(self.groupers)):
            new_features = self.groupers[i](
                xyz1, xyz2, features1
            )  # (B, C1, N2, nsample)
            new_features = self.mlps[i](
                new_features
            )  # (B, mlp[-1], N2, nsample)
            new_features = F.max_pool2d(
                new_features, kernel_size=[1, new_features.size(3)]
            )  # (B, mlp[-1], N2, 1)
            new_features = new_features.squeeze(-1)  # (B, mlp[-1], N2)

            if features2 is not None:
                new_features = torch.cat([new_features, features2],
                                         dim=1)  # (B, mlp[-1] + C2, N2)

            new_features = new_features.unsqueeze(-1)
            new_features = self.post_mlp(new_features)

            new_features_list.append(new_features)

        return torch.cat(new_features_list, dim=1).squeeze(-1)
if __name__ == "__main__":
    # Smoke test: run a tiny multi-scale SA module on random points.
    # Requires a CUDA device (the extension ops are GPU-only here).
    from torch.autograd import Variable

    # Fixed seeds so the printed outputs are reproducible.
    torch.manual_seed(1)
    torch.cuda.manual_seed_all(1)
    xyz = Variable(torch.randn(2, 9, 3).cuda(), requires_grad=True)
    xyz_feats = Variable(torch.randn(2, 9, 6).cuda(), requires_grad=True)

    test_module = PointnetSAModuleMSG(
        npoint=2, radii=[5.0, 10.0], nsamples=[6, 3], mlps=[[9, 3], [9, 6]]
    )
    test_module.cuda()
    print(test_module(xyz, xyz_feats))

    for _ in range(1):
        _, new_features = test_module(xyz, xyz_feats)
        # Backprop a gradient of ones through the module to exercise backward.
        new_features.backward(
            torch.cuda.FloatTensor(*new_features.size()).fill_(1)
        )
        print(new_features)
        print(xyz.grad)

View File

@@ -0,0 +1,554 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
''' Modified based on: https://github.com/erikwijmans/Pointnet2_PyTorch '''
from __future__ import (
division,
absolute_import,
with_statement,
print_function,
unicode_literals,
)
import torch
from torch.autograd import Function
import torch.nn as nn
# import pytorch_utils as pt_utils
from . import pytorch_utils as pt_utils
# import sys
try:
import builtins
except:
import __builtin__ as builtins
try:
from . import pointnet2
except ImportError:
if not getattr(builtins, "__POINTNET2_SETUP__", False):
raise ImportError(
"Could not import pointnet2 module.\n"
"Please see the setup instructions in the README: "
"https://github.com/erikwijmans/Pointnet2_PyTorch/blob/master/README.rst"
)
if False:
# Workaround for type hints without depending on the `typing` module
from typing import *
class RandomDropout(nn.Module):
    """Feature dropout whose drop probability is drawn uniformly from [0, p]
    on every forward call."""

    def __init__(self, p=0.5, inplace=False):
        super(RandomDropout, self).__init__()
        self.p = p  # upper bound of the random dropout probability
        self.inplace = inplace

    def forward(self, X):
        # Draw a fresh dropout probability for this call.
        theta = torch.Tensor(1).uniform_(0, self.p)[0]
        # Bug fix: the original passed ``self.train`` — the bound *method*,
        # which is always truthy — instead of the ``self.training`` flag, so
        # dropout was applied even in eval mode.
        return pt_utils.feature_dropout_no_scaling(X, theta, self.training, self.inplace)
class FurthestPointSampling(Function):
    @staticmethod
    def forward(ctx, xyz, npoint):
        # type: (Any, torch.Tensor, int) -> torch.Tensor
        r"""Iterative furthest point sampling.

        Selects a set of npoint features that have the largest minimum
        distance to each other.

        Parameters
        ----------
        xyz : torch.Tensor
            (B, N, 3) tensor where N > npoint
        npoint : int32
            number of features in the sampled set

        Returns
        -------
        torch.Tensor
            (B, npoint) tensor containing the set
        """
        return pointnet2.furthest_point_sampling(xyz, npoint)

    @staticmethod
    def backward(ctx, grad_out=None):
        # Sampling produces indices; there is nothing to differentiate.
        return None, None


furthest_point_sample = FurthestPointSampling.apply
class GatherOperation(Function):
    @staticmethod
    def forward(ctx, features, idx):
        # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor
        r"""Gather the feature columns selected by ``idx``.

        Parameters
        ----------
        features : torch.Tensor
            (B, C, N) tensor
        idx : torch.Tensor
            (B, npoint) tensor of the features to gather

        Returns
        -------
        torch.Tensor
            (B, C, npoint) tensor
        """
        # Keep the indices and input sizes around for the backward scatter.
        ctx.for_backwards = (idx, features.size(1), features.size(2))
        return pointnet2.gather_points(features, idx)

    @staticmethod
    def backward(ctx, grad_out):
        idx, _, num_points = ctx.for_backwards
        grad_features = pointnet2.gather_points_grad(
            grad_out.contiguous(), idx, num_points
        )
        return grad_features, None


gather_operation = GatherOperation.apply
class ThreeNN(Function):
    @staticmethod
    def forward(ctx, unknown, known):
        # type: (Any, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
        r"""Find the three nearest neighbors of ``unknown`` among ``known``.

        Parameters
        ----------
        unknown : torch.Tensor
            (B, n, 3) tensor of query points
        known : torch.Tensor
            (B, m, 3) tensor of reference points

        Returns
        -------
        dist : torch.Tensor
            (B, n, 3) l2 distance to the three nearest neighbors
        idx : torch.Tensor
            (B, n, 3) index of 3 nearest neighbors
        """
        # The extension op returns *squared* distances; convert to l2.
        sq_dist, idx = pointnet2.three_nn(unknown, known)
        return sq_dist.sqrt(), idx

    @staticmethod
    def backward(ctx, a=None, b=None):
        # Nearest-neighbor search is not differentiable.
        return None, None


three_nn = ThreeNN.apply
class ThreeInterpolate(Function):
    @staticmethod
    def forward(ctx, features, idx, weight):
        # type: (Any, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
        r"""
        Performs weighted linear interpolation on 3 features

        Parameters
        ----------
        features : torch.Tensor
            (B, c, m) Features descriptors to be interpolated from
        idx : torch.Tensor
            (B, n, 3) three nearest neighbors of the target features in features
        weight : torch.Tensor
            (B, n, 3) weights

        Returns
        -------
        torch.Tensor
            (B, c, n) tensor of the interpolated features
        """
        # Only the source point count m is needed to size the backward pass;
        # the original's unused B/c/n unpacking has been removed.
        m = features.size(2)
        ctx.three_interpolate_for_backward = (idx, weight, m)
        return pointnet2.three_interpolate(features, idx, weight)

    @staticmethod
    def backward(ctx, grad_out):
        # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
        r"""
        Parameters
        ----------
        grad_out : torch.Tensor
            (B, c, n) tensor with gradients of outputs

        Returns
        -------
        grad_features : torch.Tensor
            (B, c, m) tensor with gradients of features
        None
        None
        """
        idx, weight, m = ctx.three_interpolate_for_backward
        grad_features = pointnet2.three_interpolate_grad(
            grad_out.contiguous(), idx, weight, m
        )
        return grad_features, None, None


three_interpolate = ThreeInterpolate.apply
class GroupingOperation(Function):
    @staticmethod
    def forward(ctx, features, idx):
        # type: (Any, torch.Tensor, torch.Tensor) -> torch.Tensor
        r"""
        Parameters
        ----------
        features : torch.Tensor
            (B, C, N) tensor of features to group
        idx : torch.Tensor
            (B, npoint, nsample) tensor containing the indicies of features to group with

        Returns
        -------
        torch.Tensor
            (B, C, npoint, nsample) tensor
        """
        # Only the source point count N is needed for the backward scatter;
        # the original's unused B/nfeatures/nsample/C unpacking has been removed.
        N = features.size(2)
        ctx.for_backwards = (idx, N)
        return pointnet2.group_points(features, idx)

    @staticmethod
    def backward(ctx, grad_out):
        # type: (Any, torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]
        r"""
        Parameters
        ----------
        grad_out : torch.Tensor
            (B, C, npoint, nsample) tensor of the gradients of the output from forward

        Returns
        -------
        torch.Tensor
            (B, C, N) gradient of the features
        None
        """
        idx, N = ctx.for_backwards
        grad_features = pointnet2.group_points_grad(grad_out.contiguous(), idx, N)
        return grad_features, None


grouping_operation = GroupingOperation.apply
class BallQuery(Function):
    @staticmethod
    def forward(ctx, radius, nsample, xyz, new_xyz):
        # type: (Any, float, int, torch.Tensor, torch.Tensor) -> torch.Tensor
        r"""
        Parameters
        ----------
        radius : float
            radius of the balls
        nsample : int
            maximum number of features in the balls
        xyz : torch.Tensor
            (B, N, 3) xyz coordinates of the features
        new_xyz : torch.Tensor
            (B, npoint, 3) centers of the ball query

        Returns
        -------
        torch.Tensor
            (B, npoint, nsample) tensor with the indicies of the features that form the query balls
        """
        # Note the extension op's argument order: centers first, then points.
        return pointnet2.ball_query(new_xyz, xyz, radius, nsample)

    @staticmethod
    def backward(ctx, a=None):
        # Index selection is not differentiable w.r.t. any input.
        return None, None, None, None


ball_query = BallQuery.apply
class QueryAndGroup(nn.Module):
    r"""
    Groups with a ball query of radius

    Parameters
    ---------
    radius : float32
        Radius of ball
    nsample : int32
        Maximum number of features to gather in the ball
    """

    def __init__(self, radius, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, sample_uniformly=False, ret_unique_cnt=False):
        # type: (QueryAndGroup, float, int, bool, bool, bool, bool, bool) -> None
        super(QueryAndGroup, self).__init__()
        self.radius, self.nsample, self.use_xyz = radius, nsample, use_xyz
        self.ret_grouped_xyz = ret_grouped_xyz  # also return the centred xyz of each group
        self.normalize_xyz = normalize_xyz  # divide centred xyz by the query radius
        self.sample_uniformly = sample_uniformly  # re-draw samples uniformly from the unique ball members
        self.ret_unique_cnt = ret_unique_cnt  # also return the unique-point count per ball
        if self.ret_unique_cnt:
            # Unique counts are only computed on the uniform-sampling path.
            assert(self.sample_uniformly)

    def forward(self, xyz, new_xyz, features=None):
        # type: (QueryAndGroup, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor]
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            xyz coordinates of the features (B, N, 3)
        new_xyz : torch.Tensor
            centriods (B, npoint, 3)
        features : torch.Tensor
            Descriptors of the features (B, C, N)

        Returns
        -------
        new_features : torch.Tensor
            (B, 3 + C, npoint, nsample) tensor
        """
        idx = ball_query(self.radius, self.nsample, xyz, new_xyz)

        if self.sample_uniformly:
            # Replace the ball query's duplicate padding by resampling
            # uniformly from the *unique* members of each ball.
            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
            for i_batch in range(idx.shape[0]):
                for i_region in range(idx.shape[1]):
                    unique_ind = torch.unique(idx[i_batch, i_region, :])
                    num_unique = unique_ind.shape[0]
                    unique_cnt[i_batch, i_region] = num_unique
                    # Pad back to nsample by drawing from the unique indices.
                    sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long)
                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
                    idx[i_batch, i_region, :] = all_ind

        xyz_trans = xyz.transpose(1, 2).contiguous()
        grouped_xyz = grouping_operation(xyz_trans, idx)  # (B, 3, npoint, nsample)
        grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1)  # centre on the query point
        if self.normalize_xyz:
            grouped_xyz /= self.radius

        if features is not None:
            grouped_features = grouping_operation(features, idx)
            if self.use_xyz:
                new_features = torch.cat(
                    [grouped_xyz, grouped_features], dim=1
                )  # (B, C + 3, npoint, nsample)
            else:
                new_features = grouped_features
        else:
            assert (
                self.use_xyz
            ), "Cannot have not features and not use xyz as a feature!"
            new_features = grouped_xyz

        # Assemble the requested outputs in a fixed order:
        # new_features[, grouped_xyz][, unique_cnt].
        ret = [new_features]
        if self.ret_grouped_xyz:
            ret.append(grouped_xyz)
        if self.ret_unique_cnt:
            ret.append(unique_cnt)
        if len(ret) == 1:
            return ret[0]
        else:
            return tuple(ret)
class GroupAll(nn.Module):
    r"""
    Groups all N input points into a single region (no sampling).

    Parameters
    ---------
    use_xyz : bool
        Concatenate the xyz coordinates to the feature channels
    ret_grouped_xyz : bool
        Also return the grouped xyz tensor from forward()
    """

    def __init__(self, use_xyz=True, ret_grouped_xyz=False):
        # type: (GroupAll, bool, bool) -> None
        super(GroupAll, self).__init__()
        self.use_xyz = use_xyz
        # Bug fix: this flag was accepted but never stored, so forward()
        # raised AttributeError whenever ret_grouped_xyz=True was requested
        # (e.g. by PointnetSAModuleVotes with npoint=None).
        self.ret_grouped_xyz = ret_grouped_xyz

    def forward(self, xyz, new_xyz, features=None):
        # type: (GroupAll, torch.Tensor, torch.Tensor, torch.Tensor) -> Tuple[torch.Tensor]
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            xyz coordinates of the features (B, N, 3)
        new_xyz : torch.Tensor
            Ignored
        features : torch.Tensor
            Descriptors of the features (B, C, N)

        Returns
        -------
        new_features : torch.Tensor
            (B, C + 3, 1, N) tensor (followed by grouped_xyz when
            ret_grouped_xyz is set)
        """
        grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)  # (B, 3, 1, N)
        if features is not None:
            grouped_features = features.unsqueeze(2)  # (B, C, 1, N)
            if self.use_xyz:
                new_features = torch.cat(
                    [grouped_xyz, grouped_features], dim=1
                )  # (B, 3 + C, 1, N)
            else:
                new_features = grouped_features
        else:
            new_features = grouped_xyz

        if self.ret_grouped_xyz:
            return new_features, grouped_xyz
        else:
            return new_features
class CylinderQuery(Function):
    @staticmethod
    def forward(ctx, radius, hmin, hmax, nsample, xyz, new_xyz, rot):
        # type: (Any, float, float, float, int, torch.Tensor, torch.Tensor, torch.Tensor) -> torch.Tensor
        r"""
        Parameters
        ----------
        radius : float
            radius of the cylinders
        hmin, hmax : float
            endpoints of cylinder height in x-rotation axis
        nsample : int
            maximum number of features in the cylinders
        xyz : torch.Tensor
            (B, N, 3) xyz coordinates of the features
        new_xyz : torch.Tensor
            (B, npoint, 3) centers of the cylinder query
        rot: torch.Tensor
            (B, npoint, 9) flatten rotation matrices from
            cylinder frame to world frame

        Returns
        -------
        torch.Tensor
            (B, npoint, nsample) tensor with the indicies of the features that form the query balls
        """
        # Note the extension op's argument order: centers, points, rotations,
        # then the scalar query parameters.
        return pointnet2.cylinder_query(new_xyz, xyz, rot, radius, hmin, hmax, nsample)

    @staticmethod
    def backward(ctx, a=None):
        # Index selection is not differentiable w.r.t. any of the 7 inputs.
        return None, None, None, None, None, None, None


cylinder_query = CylinderQuery.apply
class CylinderQueryAndGroup(nn.Module):
    r"""
    Groups with a cylinder query of radius and height

    Parameters
    ---------
    radius : float32
        Radius of cylinder
    hmin, hmax: float32
        endpoints of cylinder height in x-rotation axis
    nsample : int32
        Maximum number of features to gather in the ball
    """

    def __init__(self, radius, hmin, hmax, nsample, use_xyz=True, ret_grouped_xyz=False, normalize_xyz=False, rotate_xyz=True, sample_uniformly=False, ret_unique_cnt=False):
        super(CylinderQueryAndGroup, self).__init__()
        self.radius, self.nsample, self.hmin, self.hmax, = radius, nsample, hmin, hmax
        self.use_xyz = use_xyz  # concatenate xyz offsets to grouped features
        self.ret_grouped_xyz = ret_grouped_xyz  # also return the centred (and possibly rotated) xyz
        self.normalize_xyz = normalize_xyz  # divide centred xyz by the query radius
        self.rotate_xyz = rotate_xyz  # express grouped xyz in each cylinder's local frame
        self.sample_uniformly = sample_uniformly  # re-draw samples uniformly from unique members
        self.ret_unique_cnt = ret_unique_cnt  # also return the unique-point count per region
        if self.ret_unique_cnt:
            # Unique counts are only computed on the uniform-sampling path.
            assert(self.sample_uniformly)

    def forward(self, xyz, new_xyz, rot, features=None):
        r"""
        Parameters
        ----------
        xyz : torch.Tensor
            xyz coordinates of the features (B, N, 3)
        new_xyz : torch.Tensor
            centriods (B, npoint, 3)
        rot : torch.Tensor
            rotation matrices (B, npoint, 3, 3)
        features : torch.Tensor
            Descriptors of the features (B, C, N)

        Returns
        -------
        new_features : torch.Tensor
            (B, 3 + C, npoint, nsample) tensor
        """
        B, npoint, _ = new_xyz.size()
        # The extension op takes the rotations as flattened 9-vectors.
        idx = cylinder_query(self.radius, self.hmin, self.hmax, self.nsample, xyz, new_xyz, rot.view(B, npoint, 9))

        if self.sample_uniformly:
            # Replace the query's duplicate padding by resampling uniformly
            # from the *unique* members of each cylinder region.
            unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
            for i_batch in range(idx.shape[0]):
                for i_region in range(idx.shape[1]):
                    unique_ind = torch.unique(idx[i_batch, i_region, :])
                    num_unique = unique_ind.shape[0]
                    unique_cnt[i_batch, i_region] = num_unique
                    sample_ind = torch.randint(0, num_unique, (self.nsample - num_unique,), dtype=torch.long)
                    all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
                    idx[i_batch, i_region, :] = all_ind

        xyz_trans = xyz.transpose(1, 2).contiguous()
        grouped_xyz = grouping_operation(xyz_trans, idx)  # (B, 3, npoint, nsample)
        grouped_xyz -= new_xyz.transpose(1, 2).unsqueeze(-1)  # centre on the query point
        if self.normalize_xyz:
            grouped_xyz /= self.radius
        if self.rotate_xyz:
            # Rotate the centred offsets into each cylinder's local frame.
            grouped_xyz_ = grouped_xyz.permute(0, 2, 3, 1).contiguous()  # (B, npoint, nsample, 3)
            grouped_xyz_ = torch.matmul(grouped_xyz_, rot)
            grouped_xyz = grouped_xyz_.permute(0, 3, 1, 2).contiguous()

        if features is not None:
            grouped_features = grouping_operation(features, idx)
            if self.use_xyz:
                new_features = torch.cat(
                    [grouped_xyz, grouped_features], dim=1
                )  # (B, C + 3, npoint, nsample)
            else:
                new_features = grouped_features
        else:
            assert (
                self.use_xyz
            ), "Cannot have not features and not use xyz as a feature!"
            new_features = grouped_xyz

        # Assemble the requested outputs in a fixed order:
        # new_features[, grouped_xyz][, unique_cnt].
        ret = [new_features]
        if self.ret_grouped_xyz:
            ret.append(grouped_xyz)
        if self.ret_unique_cnt:
            ret.append(unique_cnt)
        if len(ret) == 1:
            return ret[0]
        else:
            return tuple(ret)

View File

@@ -0,0 +1,298 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
''' Modified based on Ref: https://github.com/erikwijmans/Pointnet2_PyTorch '''
# import torch
import torch.nn as nn
from typing import List, Tuple
class SharedMLP(nn.Sequential):
    """Stack of 1x1 Conv2d blocks applied point-wise (weights shared across
    points).

    ``args`` lists the channel widths; layer i maps args[i] -> args[i+1].
    When both ``first`` and ``preact`` are set, the very first layer skips
    its batch norm and activation.
    """

    def __init__(
            self,
            args: List[int],
            *,
            bn: bool = False,
            activation=nn.ReLU(inplace=True),
            preact: bool = False,
            first: bool = False,
            name: str = ""
    ):
        super().__init__()

        for i, (c_in, c_out) in enumerate(zip(args[:-1], args[1:])):
            # The first pre-activated layer gets neither BN nor activation.
            skip_pre = first and preact and i == 0
            self.add_module(
                name + 'layer{}'.format(i),
                Conv2d(
                    c_in,
                    c_out,
                    bn=bn and not skip_pre,
                    activation=None if skip_pre else activation,
                    preact=preact
                )
            )
class _BNBase(nn.Sequential):
def __init__(self, in_size, batch_norm=None, name=""):
super().__init__()
self.add_module(name + "bn", batch_norm(in_size))
nn.init.constant_(self[0].weight, 1.0)
nn.init.constant_(self[0].bias, 0)
class BatchNorm1d(_BNBase):
    """nn.BatchNorm1d with weight initialised to 1 and bias to 0 (via _BNBase)."""

    def __init__(self, in_size: int, *, name: str = ""):
        super().__init__(in_size, batch_norm=nn.BatchNorm1d, name=name)
class BatchNorm2d(_BNBase):
    """nn.BatchNorm2d with weight initialised to 1 and bias to 0 (via _BNBase)."""

    def __init__(self, in_size: int, name: str = ""):
        super().__init__(in_size, batch_norm=nn.BatchNorm2d, name=name)
class BatchNorm3d(_BNBase):
    """nn.BatchNorm3d with weight initialised to 1 and bias to 0 (via _BNBase)."""

    def __init__(self, in_size: int, name: str = ""):
        super().__init__(in_size, batch_norm=nn.BatchNorm3d, name=name)
class _ConvBase(nn.Sequential):
    """Conv -> (BN) -> (activation) building block shared by Conv1d/2d/3d.

    With preact=True the order becomes (BN) -> (activation) -> conv instead,
    and the batch norm is sized for the *input* channels.
    """

    def __init__(
            self,
            in_size,
            out_size,
            kernel_size,
            stride,
            padding,
            activation,
            bn,
            init,
            conv=None,
            batch_norm=None,
            bias=True,
            preact=False,
            name=""
    ):
        super().__init__()

        # A conv bias is redundant when a batch norm follows it.
        bias = bias and (not bn)
        conv_unit = conv(
            in_size,
            out_size,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=bias
        )
        init(conv_unit.weight)
        if bias:
            nn.init.constant_(conv_unit.bias, 0)

        if bn:
            # Pre-activation normalises the inputs; post-activation the outputs.
            if not preact:
                bn_unit = batch_norm(out_size)
            else:
                bn_unit = batch_norm(in_size)

        if preact:
            if bn:
                self.add_module(name + 'bn', bn_unit)

            if activation is not None:
                self.add_module(name + 'activation', activation)

        self.add_module(name + 'conv', conv_unit)

        if not preact:
            if bn:
                self.add_module(name + 'bn', bn_unit)

            if activation is not None:
                self.add_module(name + 'activation', activation)
class Conv1d(_ConvBase):
    """1-D convolution block: Conv1d -> (BN) -> (activation); see _ConvBase."""

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            kernel_size: int = 1,
            stride: int = 1,
            padding: int = 0,
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=nn.init.kaiming_normal_,
            bias: bool = True,
            preact: bool = False,
            name: str = ""
    ):
        super().__init__(
            in_size,
            out_size,
            kernel_size,
            stride,
            padding,
            activation,
            bn,
            init,
            conv=nn.Conv1d,
            batch_norm=BatchNorm1d,
            bias=bias,
            preact=preact,
            name=name
        )
class Conv2d(_ConvBase):
    """2-D convolution block: Conv2d -> (BN) -> (activation); see _ConvBase."""

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            kernel_size: Tuple[int, int] = (1, 1),
            stride: Tuple[int, int] = (1, 1),
            padding: Tuple[int, int] = (0, 0),
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=nn.init.kaiming_normal_,
            bias: bool = True,
            preact: bool = False,
            name: str = ""
    ):
        super().__init__(
            in_size,
            out_size,
            kernel_size,
            stride,
            padding,
            activation,
            bn,
            init,
            conv=nn.Conv2d,
            batch_norm=BatchNorm2d,
            bias=bias,
            preact=preact,
            name=name
        )
class Conv3d(_ConvBase):
    """3-D convolution block: Conv3d -> (BN) -> (activation); see _ConvBase."""

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            kernel_size: Tuple[int, int, int] = (1, 1, 1),
            stride: Tuple[int, int, int] = (1, 1, 1),
            padding: Tuple[int, int, int] = (0, 0, 0),
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=nn.init.kaiming_normal_,
            bias: bool = True,
            preact: bool = False,
            name: str = ""
    ):
        super().__init__(
            in_size,
            out_size,
            kernel_size,
            stride,
            padding,
            activation,
            bn,
            init,
            conv=nn.Conv3d,
            batch_norm=BatchNorm3d,
            bias=bias,
            preact=preact,
            name=name
        )
class FC(nn.Sequential):
    """Fully-connected block: Linear -> (BN) -> (activation).

    With preact=True the (BN) and (activation) are applied *before* the
    linear layer (and the batch norm is sized for the input features).
    When bn is set, the linear layer drops its bias.
    """

    def __init__(
            self,
            in_size: int,
            out_size: int,
            *,
            activation=nn.ReLU(inplace=True),
            bn: bool = False,
            init=None,
            preact: bool = False,
            name: str = ""
    ):
        super().__init__()

        fc = nn.Linear(in_size, out_size, bias=not bn)
        if init is not None:
            init(fc.weight)
        if not bn:
            # Zero-initialise the bias only when it exists.
            nn.init.constant_(fc.bias, 0)

        if preact and bn:
            self.add_module(name + 'bn', BatchNorm1d(in_size))
        if preact and activation is not None:
            self.add_module(name + 'activation', activation)

        self.add_module(name + 'fc', fc)

        if not preact and bn:
            self.add_module(name + 'bn', BatchNorm1d(out_size))
        if not preact and activation is not None:
            self.add_module(name + 'activation', activation)
def set_bn_momentum_default(bn_momentum):
    """Return a callable for ``nn.Module.apply`` that sets ``momentum`` on
    every BatchNorm1d/2d/3d submodule and leaves everything else untouched."""
    def _apply(module):
        if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
            module.momentum = bn_momentum

    return _apply
class BNMomentumScheduler(object):
    """Anneal the momentum of every batch-norm layer in ``model`` per epoch.

    ``bn_lambda(epoch)`` returns the momentum for that epoch; ``setter``
    turns a momentum value into a function suitable for ``model.apply``.
    The constructor immediately applies the momentum for ``last_epoch + 1``.
    """

    def __init__(
            self, model, bn_lambda, last_epoch=-1,
            setter=set_bn_momentum_default
    ):
        if not isinstance(model, nn.Module):
            raise RuntimeError(
                "Class '{}' is not a PyTorch nn Module".format(
                    type(model).__name__
                )
            )

        self.model = model
        self.setter = setter
        self.lmbd = bn_lambda

        self.step(last_epoch + 1)
        # step() already set last_epoch; restore the caller-supplied value so
        # the next parameterless step() advances from it.
        self.last_epoch = last_epoch

    def step(self, epoch=None):
        # Apply the momentum for ``epoch`` (defaults to the next epoch).
        if epoch is None:
            epoch = self.last_epoch + 1

        self.last_epoch = epoch
        self.model.apply(self.setter(self.lmbd(epoch)))

View File

@@ -0,0 +1,61 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension
import glob
import os
# Absolute directory of this setup.py; used to build -I include paths.
ROOT = os.path.dirname(os.path.abspath(__file__))
# Extension sources live under <repo>/_ext_src/{src,include}.
_ext_src_root = "_ext_src"
_ext_cpp_sources = glob.glob("{}/src/*.cpp".format(_ext_src_root))
_ext_cu_sources = glob.glob("{}/src/*.cu".format(_ext_src_root))
# CUDA builds compile both the C++ bindings and the .cu kernels.
_ext_cuda_sources = _ext_cpp_sources + _ext_cu_sources
def is_cuda_available():
    """Probe whether a CUDA build of the extension is feasible.

    Returns True only when torch imports cleanly, reports a usable CUDA
    runtime, and exposes the CUDAExtension builder; any failure on the
    way falls back to False so the CPU-only build is selected.
    """
    try:
        import torch
        has_cuda_builder = hasattr(torch.utils.cpp_extension, "CUDAExtension")
        return torch.cuda.is_available() and has_cuda_builder
    except (ImportError, AttributeError, RuntimeError):
        return False
# Pick exactly one extension flavour: CUDA (cpp + cu sources, WITH_CUDA
# define) when a CUDA toolchain is usable, otherwise a CPU-only CppExtension
# built from the .cpp sources alone.
_ext_modules = []
if is_cuda_available():
    _ext_modules.append(
        CUDAExtension(
            name='pointnet2.pointnet2',
            sources=_ext_cuda_sources,
            extra_compile_args={
                "cxx": ["-O2", "-DWITH_CUDA", "-I{}".format("{}/{}/include".format(ROOT, _ext_src_root))],
                "nvcc": ["-O2", "-I{}".format("{}/{}/include".format(ROOT, _ext_src_root))],
            },
        )
    )
else:
    _ext_modules.append(
        CppExtension(
            name='pointnet2.pointnet2',
            sources=_ext_cpp_sources,
            extra_compile_args={
                "cxx": ["-O2", "-I{}".format("{}/{}/include".format(ROOT, _ext_src_root))],
            },
        )
    )

# BuildExtension supplies torch's compiler/ABI flags during build_ext.
setup(
    name='pointnet2',
    ext_modules=_ext_modules,
    cmdclass={
        'build_ext': BuildExtension
    }
)

View File

@@ -0,0 +1,28 @@
import os

# Enable two key CUDA debugging environment variables (must be set before
# torch is imported to take effect).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # force CUDA kernels to run synchronously so errors surface immediately
os.environ["TORCH_USE_CUDA_DSA"] = "1"  # device-side assertions -- NOTE(review): requires a torch build with DSA support, confirm

import torch
import pointnet2

# 1. Build simple point-cloud tensors (avoid empty tensors / abnormal shapes).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")
print("使用设备:", device)

# Point-cloud data: (batch_size, 3, num_points)
points = torch.randn(1, 3, 1024, device=device, dtype=torch.float32)
# Query-center data: (batch_size, 3, num_centers)
centers = torch.randn(1, 3, 128, device=device, dtype=torch.float32)
# Radius and neighbour count (values the extension function expects)
radius = 0.1
k = 32

try:
    # Call the compiled extension function.
    idx = pointnet2.ball_query(centers, points, radius, k)
    print("扩展函数执行成功,输出形状:", idx.shape)
    print("扩展函数执行正常,无内存错误!")
except Exception as e:
    print(f"扩展函数执行失败:{e}")

View File

@@ -6,9 +6,7 @@ from rclpy.node import Node
import numpy as np
import open3d as o3d
from ..utils import read_calibration_mat
import yaml
share_dir = get_package_share_directory('vision_detect')
@@ -16,7 +14,14 @@ __all__ = [
"ConfigBase"
]
class ConfigBase(Node):
SHARE_DIR = get_package_share_directory('vision_detect')
with open(os.path.join(
SHARE_DIR, "configs/error_configs/report_logging_define.json"), "r"
) as f:
WARNING_LOG_MAP = json.load(f)["warring"]
def __init__(self, name):
super().__init__(name)
"""init parameter"""
@@ -53,9 +58,13 @@ class ConfigBase(Node):
self.camera_size = None
self.position = None
self.e2e_model = None
self.map1 = self.map2 = None
self.calculate_function = None
self.fx = self.fy = 0.5
self.camera_data = {}
self.distance = 1500
@@ -82,7 +91,7 @@ class ConfigBase(Node):
self.detect_mode = self.configs['detect_mode']
self.calculate_mode = self.configs['calculate_mode']
self._read_calibration_mat()
# self._read_calibration_mat()
if self.get_camera_mode == "Service":
self.service_name = self.configs["Service_configs"]["service_name"]
@@ -126,29 +135,10 @@ class ConfigBase(Node):
os.path.join(share_dir, self.calculate_configs['complete_model_path'])
)
self.calculate_configs["source"] = source
elif self.calculate_mode == "E2E" and self.detect_mode == 'Detect':
self.calculate_configs = self.configs['E2E_configs']
else:
self.get_logger().warning("Unknown calculate_mode, use PCA")
self.calculate_configs = self.configs['PCA_configs']
self.get_logger().info("Get parameters done")
def _read_calibration_mat(self):
    """Load the three hand-eye calibration matrices from the share dir.

    Reads the left-hand, right-hand and head (eye-to-hand) calibration
    files listed in ``self.configs["calibration"]`` and stores the results
    on the node. A failed read only logs a warning; the first returned
    value is still assigned -- NOTE(review): confirm downstream code
    tolerates a failed-read placeholder matrix.
    """
    eye_in_left_hand_path = os.path.join(share_dir, self.configs["calibration"]["left_hand"])
    eye_in_right_hand_path = os.path.join(share_dir, self.configs["calibration"]["right_hand"])
    eye_to_hand_path = os.path.join(share_dir, self.configs["calibration"]["head"])

    # read_calibration_mat returns (matrix, info, success-flag).
    self.eye_in_left_hand_mat, info, sign = read_calibration_mat(eye_in_left_hand_path)
    if not sign:
        self.get_logger().warning(f"left_mat: {info}")
    self.eye_in_right_hand_mat, info, sign = read_calibration_mat(eye_in_right_hand_path)
    if not sign:
        self.get_logger().warning(f"right_mat: {info}")
    self.eye_to_hand_mat, info, sign = read_calibration_mat(eye_to_hand_path)
    if not sign:
        self.get_logger().warning(f"head_mat: {info}")
    self.get_logger().info("Read calibration mat file done")

View File

@@ -14,10 +14,18 @@ from geometry_msgs.msg import Pose, Point, Quaternion
from interfaces.msg import PoseClassAndID, PoseArrayClassAndID
from ..utils import *
from ..utils import distortion_correction, crop_imgs_box_xywh, draw_box, draw_mask, rmat2quat, \
crop_imgs_mask, get_map, create_o3d_pcd, save_img, refine_grasp_pose
from .init_node import InitBase
E2E_SIGN = True
try:
from ..utils import calculate_pose_e2e
except (ImportError, ModuleNotFoundError):
E2E_SIGN = False
class DetectNode(InitBase):
"""Detect Node"""
def __init__(self, name):
@@ -55,10 +63,11 @@ class DetectNode(InitBase):
"""同步回调函数"""
color_img_cv = self.cv_bridge.imgmsg_to_cv2(color_img_ros, "bgr8")
depth_img_cv = self.cv_bridge.imgmsg_to_cv2(depth_img_ros, '16UC1')
color_img_cv, depth_img_cv = distortion_correction(color_img_cv, depth_img_cv, self.map1, self.map2)
img, pose_list = self.function(color_img_cv, depth_img_cv)
img, pose_list, sign = self.function(color_img_cv, depth_img_cv)
if not sign:
self.get_logger().warning(self.WARNING_LOG_MAP[str(pose_list)])
# masks为空结束这一帧
if img is None:
@@ -83,7 +92,8 @@ class DetectNode(InitBase):
self.pub_pose_list.publish(pose_list_all)
def _service_callback(self, request, response):
# self.get_logger().info("service callback start")
print(" \n ")
print("========================== < start > ==========================")
response.header.stamp = self.get_clock().now().to_msg()
response.header.frame_id = "camera_detect"
@@ -95,23 +105,24 @@ class DetectNode(InitBase):
response.success = False
response.info = "Camera data have not objects"
response.objects = []
# self.get_logger().info("service callback done")
print("=========================== < end > ===========================")
return response
response.success = False
response.info = f"Name is wrong: {request.camera_position}"
response.objects = []
# self.get_logger().info("service callback done")
print("=========================== < end > ===========================")
return response
if request.camera_position == 'left':
self.hand_eye_mat = self.eye_in_left_hand_mat
self.p = "left"
elif request.camera_position == 'right':
self.hand_eye_mat = self.eye_in_right_hand_mat
self.p = "right"
else:
self.hand_eye_mat = self.eye_to_hand_mat
self.p = "head"
self.camera_size = [color_img_ros.width, color_img_ros.height]
@@ -121,15 +132,15 @@ class DetectNode(InitBase):
map1, map2, self.k = get_map(self.k, d, self.camera_size)
color_img_cv, depth_img_cv = distortion_correction(color_img_cv, depth_img_cv, map1, map2)
img, pose_list = self.function(color_img_cv, depth_img_cv)
img, pose_list, sign = self.function(color_img_cv, depth_img_cv)
if not sign:
self.get_logger().warning(self.WARNING_LOG_MAP[str(pose_list)])
# masks为空结束这一帧
if self.output_boxes or self.output_masks:
if img is None:
img = color_img_ros
self.pub_detect_image.publish(img)
response.info = "pose dict is empty"
response.success = False
response.objects = []
if pose_list:
else:
response.info = "Success get pose"
response.success = True
for item in pose_list:
@@ -141,19 +152,23 @@ class DetectNode(InitBase):
grab_width = item["grab_width"]
)
)
else:
response.info = "pose dict is empty"
response.success = False
response.objects = []
# self.get_logger().info("service callback done")
# publish detect image
if self.output_boxes or self.output_masks:
if img is None:
img = color_img_ros
self.pub_detect_image.publish(img)
print("=========================== < end > ===========================")
return response
def _seg_image(self, rgb_img: np.ndarray, depth_img: np.ndarray):
"""Use segmentation model"""
self.get_logger().info('start')
pose_list = []
home = os.path.expanduser("~")
save_dir = os.path.join(home, "images")
save_img(rgb_img.copy(), "orign_image.png", save_dir=save_dir, mark_cur_time=True)
# Get Predict Results
time1 = time.time()
results = self.model(rgb_img, retina_masks=True, conf=self.confidence, classes=self.classes)
@@ -162,8 +177,7 @@ class DetectNode(InitBase):
# Get masks
if result.masks is None or len(result.masks) == 0:
self.get_logger().info(f"Detect object num: 0")
return None, None
return None, 1000, False
masks = result.masks.data.cpu().numpy()
# Get boxes
@@ -171,81 +185,96 @@ class DetectNode(InitBase):
class_ids = result.boxes.cls.cpu().numpy()
labels = result.names
x_centers, y_centers = boxes[:, 0], boxes[:, 1]
sorted_index = np.lexsort((-y_centers, x_centers))
masks = masks[sorted_index]
boxes = boxes[sorted_index]
class_ids = class_ids[sorted_index]
time3 = time.time()
self.get_logger().info(f"Detect object num: {len(masks)}")
full_points = create_o3d_pcd(
depth_img, self.camera_size, self.k, **self.calculate_configs
)
time_full_points = time.time()
print(f"create full points: {(time_full_points - time3) * 1000}")
if self.calculate_mode == "E2E" and self.detect_mode == 'Detect' and E2E_SIGN:
self.calculate_configs["orign_point_clouds"] = create_o3d_pcd(
depth_img, self.camera_size, self.k, **self.calculate_configs
)
for i, (mask, box) in enumerate(zip(masks, boxes)):
imgs_crop, (x_min, y_min) = crop_imgs_box_xywh([depth_img, mask], box, True)
depth_crop, mask_crop = imgs_crop
if depth_crop is None:
self.get_logger().warning("Depth crop is None")
continue
intrinsics = o3d.camera.PinholeCameraIntrinsic(
int(self.camera_size[0]),
int(self.camera_size[1]),
self.k[0],
self.k[4],
self.k[2] - x_min,
self.k[5] - y_min
int(self.camera_size[0]), int(self.camera_size[1]),
self.k[0], self.k[4],
self.k[2] - x_min, self.k[5] - y_min
)
rmat, grab_width = self.calculate_function(
mask_crop,
depth_crop,
intrinsics,
calculate_grab_width=True,
**self.calculate_configs
rmat, grab_width, sign = self.calculate_function(
mask_crop, depth_crop, intrinsics,
calculate_grab_width=True, **self.calculate_configs
)
if rmat is None:
self.get_logger().warning("Object point cloud have too many noise")
if not sign:
self.get_logger().warning(self.WARNING_LOG_MAP[str(rmat)])
continue
self.get_logger().info(f"grab_width: {grab_width}")
rmat = self.hand_eye_mat @ rmat
x, y, z, rw, rx, ry, rz = rmat2quat(rmat)
if (x, y, z) != (0.0, 0.0, 0.0):
pose = Pose()
pose.position = Point(x=x, y=y, z=z)
pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
pose_list.append(
{
"class_id": int(class_ids[i]),
"class_name": labels[class_ids[i]],
"pose": pose,
"grab_width": grab_width
}
time_c = time.time()
if self.p == "left" or self.p == "right":
position = rmat[0:3, 3]
rmat, sign = refine_grasp_pose(
full_points, self.calculate_configs.get("voxel_size"), position,
search_mode=True
)
if not sign:
self.get_logger().warning(self.WARNING_LOG_MAP[str(rmat)])
continue
time_e = time.time()
print(f"Refine: {(time_e - time_c) * 1000} ms")
self.get_logger().info(f"grab_width: {grab_width}")
x, y, z, rw, rx, ry, rz = rmat2quat(self.hand_eye_mat @ rmat)
pose = Pose()
pose.position = Point(x=x, y=y, z=z)
pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
self.get_logger().info(f"xyz, wxyz: {x, y, z, rw, rx, ry, rz}")
pose_list.append(
{
"class_id": int(class_ids[i]),
"class_name": labels[class_ids[i]],
"pose": pose,
"grab_width": grab_width
}
)
time4 = time.time()
if not pose_list:
return None, 1003, False
self.get_logger().info(f'{(time2 - time1) * 1000} ms, model predict')
# self.get_logger().info(f'{(time3 - time2) * 1000} ms, get mask and some param')
self.get_logger().info(f'{(time4 - time3) * 1000} ms, calculate all mask PCA')
self.get_logger().info(f'{(time4 - time1) * 1000} ms, completing a picture entire process')
self.get_logger().info('end')
# mask_img and box_img is or not output
if self.output_boxes and not self.output_masks:
home = os.path.expanduser("~")
save_path = os.path.join(home, "detect_image.png")
draw_box(rgb_img, result, save_path=save_path)
# draw_box(rgb_img, result, save_path=False)
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list
elif self.output_boxes and self.output_masks:
draw_box(rgb_img, result)
if not self.output_boxes and not self.output_masks:
return None, pose_list, True
if self.output_boxes:
draw_box(rgb_img, result, save_dir=save_dir, mark_time = True)
if self.output_masks:
draw_mask(rgb_img, result)
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list
elif not self.output_boxes and self.output_masks:
draw_mask(rgb_img, result)
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list
else:
return None, pose_list
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list, True
def _seg_color(self, rgb_img: np.ndarray, depth_img: np.ndarray):
"""Use segmentation model"""
@@ -268,31 +297,26 @@ class DetectNode(InitBase):
depth_crop = imgs_crop[0]
if depth_crop is None:
self.get_logger().warning("depth_crop is None")
return None, None
self.get_logger().warning(self.WARNING_LOG_MAP[str(1001)])
return None, 1001, False
intrinsics = o3d.camera.PinholeCameraIntrinsic(
int(self.camera_size[0]),
int(self.camera_size[1]),
self.k[0],
self.k[4],
self.k[2] - x_min,
self.k[5] - y_min
int(self.camera_size[0]), int(self.camera_size[1]),
self.k[0], self.k[4],
self.k[2] - x_min, self.k[5] - y_min
)
rmat, _ = self.calculate_function(
mask_crop,
depth_crop,
intrinsics,
**self.calculate_configs
rmat, _, sign = self.calculate_function(
mask_crop, depth_crop, intrinsics, **self.calculate_configs
)
if rmat is None:
self.get_logger().warning("Color Area point cloud have too many noise")
return None, None
if not sign:
# self.get_logger().warning("Color Area point cloud have too many noise")
return None, 1100, False
rmat = self.hand_eye_mat @ rmat
x, y, z, rw, rx, ry, rz = rmat2quat(rmat)
T = self.hand_eye_mat @ rmat
x, y, z, rw, rx, ry, rz = rmat2quat(T)
if (x, y, z) != (0.0, 0.0, 0.0):
pose = Pose()
@@ -307,12 +331,13 @@ class DetectNode(InitBase):
}
)
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list, True
def _seg_crossboard(self, rgb_img, depth_img):
pose_list = []
rgb_img_gray = cv2.cvtColor(rgb_img, cv2.COLOR_BGR2GRAY)
ret, corners = cv2.findChessboardCorners(rgb_img_gray, self.pattern_size, cv2.CALIB_CB_FAST_CHECK)
ret, corners = cv2.findChessboardCorners(rgb_img_gray,
self.pattern_size, cv2.CALIB_CB_FAST_CHECK)
if ret:
# 角点亚像素精确化(提高标定精度)
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
@@ -337,64 +362,50 @@ class DetectNode(InitBase):
depth_crop = img_crop[0]
if depth_crop is None:
return None, None
self.get_logger().warning(self.WARNING_LOG_MAP[str(1001)])
return None, 1001, False
intrinsics = o3d.camera.PinholeCameraIntrinsic(
int(self.camera_size[0]),
int(self.camera_size[1]),
self.k[0],
self.k[4],
self.k[2] - x_min,
self.k[5] - y_min
int(self.camera_size[0]), int(self.camera_size[1]),
self.k[0], self.k[4],
self.k[2] - x_min, self.k[5] - y_min
)
rmat, _ = self.calculate_function(
mask_crop,
depth_crop,
intrinsics,
**self.calculate_configs
rmat, _, sign = self.calculate_function(
mask_crop, depth_crop, intrinsics, **self.calculate_configs
)
if rmat is None:
self.get_logger().warning("Corssboard point cloud have too many noise")
return None, None
if not sign:
# self.get_logger().warning("Corssboard point cloud have too many noise")
return None, 1100, False
x, y, z, rw, rx, ry, rz = rmat2quat(rmat)
self.get_logger().info(f"{x}, {y}, {z}, {rw}, {rx}, {ry}, {rz}")
if (x, y, z) != (0.0, 0.0, 0.0):
pose = Pose()
pose.position = Point(x=x, y=y, z=z)
pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
pose_list.append(
{
"class_id": int(99),
"class_name": 'crossboard',
"pose": pose,
"grab_width": 0.0
}
)
pose = Pose()
pose.position = Point(x=x, y=y, z=z)
pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
pose_list.append(
{
"class_id": int(99),
"class_name": 'crossboard',
"pose": pose,
"grab_width": 0.0
}
)
cv2.putText(
rgb_img,
f'cs: x: {x:.3f}, y: {y:.3f}, z: {z:.3f}',
(0, 0 + 30),cv2.FONT_HERSHEY_SIMPLEX,
1,
(255, 255, 0),
2
rgb_img, f'cs: x: {x:.3f}, y: {y:.3f}, z: {z:.3f}',
(0, 0 + 30),cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2
)
cv2.putText(
rgb_img,
f'quat: rw: {rw:.3f}, rx: {rx:.3f}, ry: {ry:.3f}, rz: {rz:.3f}',
(0, 0 + 70), cv2.FONT_HERSHEY_SIMPLEX,
1,
(255, 255, 0),
2
rgb_img, f'quat: rw: {rw:.3f}, rx: {rx:.3f}, ry: {ry:.3f}, rz: {rz:.3f}',
(0, 0 + 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2
)
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_list, True
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), None
return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), None, True
if __name__ == '__main__':

View File

@@ -1,3 +1,4 @@
import os
from ament_index_python.packages import get_package_share_directory
import torch
@@ -12,15 +13,22 @@ from sensor_msgs.msg import Image, CameraInfo
from interfaces.msg import PoseArrayClassAndID, ImgMsg
from interfaces.srv import VisionObjectRecognition
from ..utils import calculate_pose_pca, calculate_pose_icp
from ..utils import calculate_pose_pca, calculate_pose_icp, read_calibration_mat
from .config_node import ConfigBase
share_dir = get_package_share_directory('vision_detect')
E2E_SIGN = True
try:
from ..net import GraspNet
from ..utils import calculate_pose_e2e
except (ImportError, ModuleNotFoundError):
E2E_SIGN = False
class InitBase(ConfigBase):
def __init__(self, name):
super().__init__(name)
self.future = Future()
self._read_calibration_mat()
if self.get_camera_mode == "Service":
self._init_service()
@@ -53,6 +61,19 @@ class InitBase(ConfigBase):
self.calculate_function = calculate_pose_pca
elif self.calculate_mode == "ICP" and self.detect_mode == 'Detect':
self.calculate_function = calculate_pose_icp
elif self.calculate_mode == "E2E" and self.detect_mode == 'Detect':
self.calculate_function = calculate_pose_e2e
self.e2e_model = GraspNet(seed_feat_dim=512, is_training=False)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
self.e2e_model.to(device)
self.e2e_model.load_state_dict(
torch.load(
os.path.join(self.SHARE_DIR, self.calculate_configs["checkpoint_path"]),
map_location=device
)
)
self.e2e_model.eval()
self.calculate_configs["model"] = self.e2e_model
else:
self.calculate_function = calculate_pose_pca
@@ -145,6 +166,28 @@ class InitBase(ConfigBase):
self.get_logger().info("Initialize subscriber done")
def _read_calibration_mat(self):
    """Load and log the three hand-eye calibration matrices.

    Paths come from ``self.configs["calibration"]``, resolved against the
    package share directory (``self.SHARE_DIR``). Each matrix is logged
    after reading; a failed read is reported as a warning but the first
    returned value is still assigned.
    """
    eye_in_left_hand_path = os.path.join(self.SHARE_DIR, self.configs["calibration"]["left_hand"])
    eye_in_right_hand_path = os.path.join(self.SHARE_DIR, self.configs["calibration"]["right_hand"])
    eye_to_hand_path = os.path.join(self.SHARE_DIR, self.configs["calibration"]["head"])

    # read_calibration_mat returns (matrix, info, success-flag).
    self.eye_in_left_hand_mat, info, sign = read_calibration_mat(eye_in_left_hand_path)
    self.get_logger().info(f"left_hand_mat: {self.eye_in_left_hand_mat}")
    if not sign:
        self.get_logger().warning(f"left_mat: {info}")
    self.eye_in_right_hand_mat, info, sign = read_calibration_mat(eye_in_right_hand_path)
    self.get_logger().info(f"right_hand_mat: {self.eye_in_right_hand_mat}")
    if not sign:
        self.get_logger().warning(f"right_mat: {info}")
    self.eye_to_hand_mat, info, sign = read_calibration_mat(eye_to_hand_path)
    self.get_logger().info(f"head_mat: {self.eye_to_hand_mat}")
    if not sign:
        self.get_logger().warning(f"head_mat: {info}")
    self.get_logger().info("Read calibration mat file done")
def _camera_info_callback(self, msg: CameraInfo):
    """CameraInfo subscription callback; currently a no-op placeholder."""
    pass

View File

@@ -2,3 +2,5 @@ from .image_tools import *
from .draw_tools import *
from .calculate_tools import *
from .file_tools import *
from .pointclouds_tools import *
from .grasp_refine import *

View File

@@ -1,18 +1,35 @@
"""计算工具"""
import logging
import cv2
# import cv2
import numpy as np
import open3d as o3d
import transforms3d as tfs
from .object_icp import object_icp
from .object_icp import *
from .pointclouds_tools import *
E2E_SIGN = True
try:
import torch
import MinkowskiEngine as ME
import collections.abc as container_abcs
from ..net import pred_decode, ModelFreeCollisionDetector
except (ImportError, OSError, RuntimeError):
logging.warning("ImportError or OSError or RuntimeError")
E2E_SIGN = False
__all__ = [
"calculate_pose_pca", "calculate_pose_icp",
"rmat2quat", "quat2rmat",
"rmat2quat", "quat2rmat", "calculate_pose_e2e"
]
if not E2E_SIGN:
__all__.remove("calculate_pose_e2e")
def pca(data: np.ndarray, sort=True):
"""主成分分析 """
center = np.mean(data, axis=0)
@@ -40,30 +57,52 @@ def calculate_pose_pca(
calculate_grab_width: bool = False,
**kwargs
):
"""点云主成分分析法计算位态"""
"""点云主成分分析法计算位态
----------
input:
mask: np.ndarray
depth_img: np.ndarray
intrinsics: o3d.camera.PinholeCameraIntrinsic
calculate_grab_width: bool
**kwargs:
output:
rmat: np.ndarray (4, 4)
grab_width: list
sign: bool
"""
depth_img_mask = np.zeros_like(depth_img)
depth_img_mask[mask > 0] = depth_img[mask > 0]
depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
point_cloud, sign = create_o3d_denoised_pcd(depth_img_mask, intrinsics, **kwargs)
if not sign:
return point_cloud, [], False
point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
depth=depth_o3d,
intrinsic=intrinsics,
depth_scale=kwargs.get("depth_scale", 1000.0),
depth_trunc=kwargs.get("depth_trunc", 3.0),
)
# depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
#
# point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
# depth=depth_o3d,
# intrinsic=intrinsics,
# depth_scale=kwargs.get("depth_scale", 1000.0),
# depth_trunc=kwargs.get("depth_trunc", 3.0),
# )
#
# point_cloud, sign = point_cloud_denoising(point_cloud, kwargs.get("voxel_size", 0.002))
# if not sign:
# return 1100, [], False
#
# if len(point_cloud.points) == 0:
# # logging.warning("point_cloud is empty")
# return 1101, [], False
point_cloud = point_cloud_denoising(point_cloud, kwargs.get("voxel_size", 0.005))
if point_cloud is None:
return None, [0.0, 0.0, 0.0]
if len(point_cloud.points) == 0:
logging.warning("clean_pcd is empty")
return np.eye(4), [0.0, 0.0, 0.0]
center = point_cloud.get_center()
x, y, z = center
if calculate_grab_width:
if np.asarray(point_cloud.points).shape[0] < 4:
# logging.warning("点数不足,不能算 OBB")
return 1200, [0.0, 0.0, 0.0], False
obb = point_cloud.get_oriented_bounding_box()
x, y, z = obb.center
extent = obb.extent
order = np.argsort(-extent)
@@ -74,20 +113,16 @@ def calculate_pose_pca(
v = v[:, order]
if v is None:
logging.warning("PCA output v is None")
return np.eye(4), [0.0, 0.0, 0.0]
return 1201, [0.0, 0.0, 0.0], False
grab_width = grab_width * 1.05
else:
center = point_cloud.get_center()
x, y, z = center
w, v = pca(np.asarray(point_cloud.points))
if w is None or v is None:
logging.warning("PCA output w or v is None")
return np.eye(4), 0.0
return 1201, 0.0, False
grab_width = [0.0, 0.0, 0.0]
vx, vy, vz = v[:,0], v[:,1], v[:,2]
@@ -105,7 +140,7 @@ def calculate_pose_pca(
# draw(point_cloud_u, rmat)
# draw(point_cloud, rmat)
return rmat, grab_width
return rmat, grab_width, True
def calculate_pose_icp(
@@ -119,22 +154,26 @@ def calculate_pose_icp(
depth_img_mask = np.zeros_like(depth_img)
depth_img_mask[mask > 0] = depth_img[mask > 0]
depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
point_cloud, sign = create_o3d_denoised_pcd(depth_img_mask, intrinsics, **kwargs)
if not sign:
return point_cloud, [], False
point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
depth=depth_o3d,
intrinsic=intrinsics,
depth_scale=kwargs.get("depth_scale", 1000.0),
depth_trunc=kwargs.get("depth_trunc", 3.0)
)
point_cloud = point_cloud_denoising(point_cloud, kwargs.get("voxel_size", 0.010))
if point_cloud is None:
return None, [0.0, 0.0, 0.0]
if len(point_cloud.points) == 0:
logging.warning("clean_pcd is empty")
return np.eye(4), [0.0, 0.0, 0.0]
# depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
#
# point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
# depth=depth_o3d,
# intrinsic=intrinsics,
# depth_scale=kwargs.get("depth_scale", 1000.0),
# depth_trunc=kwargs.get("depth_trunc", 3.0)
# )
#
# point_cloud, sign = point_cloud_denoising(point_cloud, kwargs.get("voxel_size", 0.002))
# if not sign:
# return 1100, [], False
#
# if len(point_cloud.points) == 0:
# # logging.warning("clean_pcd is empty")
# return 1101, [0.0, 0.0, 0.0], False
if calculate_grab_width:
pass
@@ -148,97 +187,190 @@ def calculate_pose_icp(
)
grab_width = [0.0, 0.0, 0.0]
return rmat, grab_width
return rmat, grab_width, True
def point_cloud_denoising(point_cloud: o3d.geometry.PointCloud, voxel_size: float = 0.005):
"""点云去噪"""
point_cloud = point_cloud.remove_non_finite_points()
while True:
down_pcd = point_cloud.voxel_down_sample(voxel_size=voxel_size * 0.5)
point_num = len(down_pcd.points)
if point_num <= 1000:
break
voxel_size *= 2
if E2E_SIGN:
def minkowski_collate_fn(list_data):
coordinates_batch, features_batch = ME.utils.sparse_collate([d["coors"] for d in list_data],
[d["feats"] for d in list_data])
coordinates_batch = np.ascontiguousarray(coordinates_batch, dtype=np.int32)
coordinates_batch, features_batch, _, quantize2original = ME.utils.sparse_quantize(
coordinates_batch, features_batch, return_index=True, return_inverse=True)
# logging.fatal("point_cloud_denoising: point_num={}".format(len(point_cloud.points)))
# logging.fatal("point_cloud_denoising: point_num={}".format(point_num))
res = {
"coors": coordinates_batch,
"feats": features_batch,
"quantize2original": quantize2original,
"point_clouds": torch.stack([torch.from_numpy(b) for b in [d["point_clouds"] for d in list_data]], 0)
}
return res
# 半径滤波
clean_pcd, _ = down_pcd.remove_radius_outlier(
nb_points=10,
radius=voxel_size * 5
def calculate_pose_e2e(
mask,
depth_img: np.ndarray,
intrinsics,
calculate_grab_width: bool = False,
**kwargs
):
"""
点云抓取姿态预测模型计算位态
-----
input:
mask: np.ndarray
depth_img: np.ndarray
intrinsics: o3d.camera.PinholeCameraIntrinsic
calculate_grab_width: bool (abandon)
**kwargs:
"depth_scale": float
"depth_trunc": float
"voxel_size": float
"model":
"collision_thresh": float
"""
# logging.error("stage 1")
depth_img_mask = np.zeros_like(depth_img)
depth_img_mask[mask > 0] = depth_img[mask > 0]
# 点云创建
depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
depth=depth_o3d,
intrinsic=intrinsics,
depth_scale=kwargs.get("depth_scale", 1000.0),
depth_trunc=kwargs.get("depth_trunc", 3.0),
)
point_cloud = point_cloud.remove_non_finite_points()
# 点云去噪过程
down_pcd = point_cloud.voxel_down_sample(voxel_size=kwargs.get("voxel_size", 0.002))
clean_pcd, _ = down_pcd.remove_radius_outlier(
nb_points=max(int(round(10 * kwargs.get("voxel_size", 0.002) / 0.005)), 3),
radius=kwargs.get("voxel_size", 0.002) * 10
)
clean_pcd, _ = clean_pcd.remove_statistical_outlier(
nb_neighbors=max(int(round(10 * kwargs.get("voxel_size", 0.002) / 0.005)), 3),
std_ratio=2.0
)
points = np.asarray(clean_pcd.points)
clean_pcd.points = o3d.utility.Vector3dVector(points[points[:, 2] >= 0.2])
labels = np.array(
clean_pcd.cluster_dbscan(
eps=kwargs.get("voxel_size", 0.002) * 10,
min_points=max(int(round(10 * kwargs.get("voxel_size", 0.002) / 0.005)), 3)
)
)
# 统计滤波
clean_pcd, _ = clean_pcd.remove_statistical_outlier(
nb_neighbors=10,
std_ratio=2.0
)
# 点云簇过滤
points = np.asarray(clean_pcd.points)
cluster_label = set(labels)
point_cloud_clusters = []
for label in cluster_label:
if label == -1:
continue
idx = np.where(labels == label)[0]
point_cloud_cluster = clean_pcd.select_by_index(idx)
points_cluster_z = points[idx, 2]
z_avg = np.mean(points_cluster_z)
if z_avg < 0.2:
continue
point_cloud_clusters.append((point_cloud_cluster, z_avg))
# 过滤过近的点
points = np.asarray(clean_pcd.points)
clean_pcd.points = o3d.utility.Vector3dVector(points[points[:, 2] >= 0.2])
point_cloud_clusters.sort(key=lambda x: x[1])
point_cloud = point_cloud_clusters[0][0]
# # 使用数量最大簇判定噪声强度
# _, counts = np.unique(labels[labels >= 0], return_counts=True)
# largest_cluster_ratio = counts.max() / len(labels)
# if largest_cluster_ratio < 0.5:
# return None
# 判断点云是否为空
if len(point_cloud.points) == 0:
return 1101, [0.0, 0.0, 0.0], False
labels = np.array(clean_pcd.cluster_dbscan(eps=voxel_size * 5, min_points=10))
if len(labels[labels >= 0]) == 0:
return clean_pcd
# # 点云补齐15000个点
# points = np.asarray(point_cloud.points)
# if len(points) >= 15000:
# idxs = np.random.choice(len(points), 15000, replace=False)
# else:
# idxs1 = np.arange(len(points))
# idxs2 = np.random.choice(len(points), 15000 - len(points), replace=True)
# idxs = np.concatenate([idxs1, idxs2], axis=0)
# points = points[idxs]
# 使用距离最近簇作为物体
points = np.asarray(clean_pcd.points)
cluster_label = set(labels)
point_cloud_clusters = []
for label in cluster_label:
if label == -1:
continue
idx = np.where(labels == label)[0]
point_cloud_cluster = clean_pcd.select_by_index(idx)
points_cluster_z = points[idx, 2]
z_avg = np.mean(points_cluster_z)
if z_avg < 0.2:
continue
point_cloud_clusters.append((point_cloud_cluster, z_avg))
# 构建推理数据结构
ret_dict = {
'point_clouds': points.astype(np.float32),
'coors': points.astype(np.float32) / kwargs.get("voxel_size", 0.002),
'feats': np.ones_like(points).astype(np.float32),
}
batch_data = minkowski_collate_fn([ret_dict])
if len(point_cloud_clusters) == 0:
return clean_pcd
# 将数据置于对应的设备上
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
for key in batch_data:
batch_data[key] = batch_data[key].to(device)
point_cloud_clusters.sort(key=lambda x: x[1])
clean_pcd = point_cloud_clusters[0][0]
logging.warning(f'points num: {len(points)}')
# 使用最近簇判断噪音强度
largest_cluster_ratio = len(clean_pcd.points) / point_num
if largest_cluster_ratio < 0.20:
return None
# 点云数量判断,是否返回
if batch_data['coors'].shape[0] < 128: # 例如 128 / 256
return 1300, [0.0], False
if batch_data["point_clouds"].shape[1] < 128: # 例如 128 / 256
return 1301, [0.0], False
return clean_pcd
# 梯度置0进入推理
with torch.no_grad():
end_points = kwargs.get("model")(batch_data)
if end_points is None:
return 1302, [0.0, 0.0, 0.0], False
grasp_preds = pred_decode(end_points)
# 推理结果后处理
preds = grasp_preds[0].detach().cpu().numpy()
sorted_index = np.argsort(-preds[:, 0])
preds = preds[sorted_index]
preds = preds[:10]
if kwargs.get("collision_thresh", 0.01) > 0:
cloud = kwargs.get("orign_point_clouds")
if cloud is not None and len(preds) > 0:
mfcdetector = ModelFreeCollisionDetector(
cloud,
voxel_size=kwargs.get("voxel_size", 0.002)
)
collision_mask = mfcdetector.detect(
preds, approach_dist=0.05,
collision_thresh=kwargs.get("collision_thresh", 0.01)
)
preds = preds[~collision_mask]
# logging.error("stage 8")
Rs = preds[:, 4:13].reshape(-1, 3, 3)
centers = preds[:, 13:16].reshape(-1, 3)
grab_width = preds[:, 1]
if not len(Rs):
return 1303, [0.0, 0.0, 0.0], False
# logging.error("stage 9")
rmat = []
for r, center in zip(Rs, centers):
vz, vx, vy = r[:, 0], r[:, 1], r[:, 2]
if vx[0] < 0:
vx = -vx
if vz[2] < 0:
vz = -vz
if not np.allclose(np.cross(vx, vy), vz):
vy = -vy
R = np.column_stack((vx, vy, vz))
rmat.append(tfs.affines.compose(np.squeeze(np.asarray(center)), R, [1, 1, 1]))
if len(rmat) == 0:
return 1304, [0.0, 0.0, 0.0], False
return rmat[0], [grab_width[0]], True
# def calculate_grab_width(mask, k, depth):
# """计算重心宽度"""
# mask = mask.astype(np.uint8) * 255
# contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
#
# if not contours:
# return 0.0
#
# c = max(contours, key=cv2.contourArea)
# box = cv2.boxPoints(cv2.minAreaRect(c))
# if np.linalg.norm(box[1] - box[0]) < np.linalg.norm(box[1] - box[2]):
# point_diff = box[1] - box[0]
# else:
# point_diff = box[1] - box[2]
#
# grab_width = depth * np.sqrt(
# point_diff[0]**2 / k[0]**2 + point_diff[1]**2 / k[4]**2
# )
# return grab_width
def rmat2quat(rmat):
"""Convert rotation matrix to quaternion."""

View File

@@ -6,6 +6,7 @@ import cv2
import numpy as np
import open3d as o3d
from .image_tools import save_img
__all__ = [
@@ -16,7 +17,8 @@ def draw_box(
rgb_img: np.ndarray,
segmentation_result,
put_text: bool = True,
save_path: Union[bool, str] = False
save_dir: Union[bool, str] = False,
mark_time: bool = False
):
"""
绘制目标检测边界框
@@ -26,6 +28,12 @@ def draw_box(
class_ids = segmentation_result.boxes.cls.cpu().numpy()
labels = segmentation_result.names
x_centers, y_centers = boxes[:, 0], boxes[:, 1]
sorted_index = np.lexsort((-y_centers, x_centers))
boxes = boxes[sorted_index]
class_ids = class_ids[sorted_index]
confidences = confidences[sorted_index]
for i, box in enumerate(boxes):
x_center, y_center, width, height = box[:4]
@@ -41,15 +49,8 @@ def draw_box(
cv2.putText(rgb_img, f"{i}", (p1[0] + 15, p1[1] + 30),
cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2)
if save_path:
if isinstance(save_path, str):
dir_path = os.path.dirname(save_path)
if dir_path:
os.makedirs(dir_path, exist_ok=True)
else:
home_path = os.path.expanduser("~")
save_path = os.path.join(home_path, "detect_color_img.png")
cv2.imwrite(save_path, rgb_img)
if save_dir:
save_img(rgb_img, "detect_color_img.png", save_dir, mark_time)
def draw_mask(rgb_img: np.ndarray, segmentation_result):

View File

@@ -0,0 +1,282 @@
import numpy as np
import transforms3d as tfs
# import open3d as o3d
__all__ = ["refine_grasp_pose"]
def collision_detector(
    points: np.ndarray,
    refine: np.ndarray,
    volume: list[float],
    iou: float = 0.001,
    search_mode: bool = False,
    **kwargs
) -> int:
    """
    Check a candidate grasp offset against the scene cloud.

    input:
        points: np.ndarray (3, n), scene points in the grasp frame
        refine: np.ndarray (3, ), candidate offset of the grasp frame origin
        volume: list [left, right], expected voxel counts of full finger boxes
        iou: float, occupancy ratio above which a finger counts as colliding
        search_mode: bool, when True skip the palm (hand-body) checks
        **kwargs:
            "grab_width": float
            "hand_size": list [width, height, length]
            "left_size": list [thick, width, length]
            "right_size": list [thick, width, length]
    output:
        collision_code: int
            0 free, 1 palm blocked (both sides or centre), 2 palm bottom,
            3 palm top, 4 left finger, 5 right finger, 6 both fingers.
    """
    hand_size = kwargs.get('hand_size', [0.113, 0.063, 0.13])
    left_size = kwargs.get('left_size', [0.006, 0.037, 0.086])
    right_size = kwargs.get('right_size', [0.006, 0.037, 0.086])
    # Slightly shrunk opening so borderline contacts are not flagged.
    grab_width = kwargs.get('grab_width', 0.10) * 0.95
    x, y, z = refine

    def box_mask(x_lo, x_hi, y_lo, y_hi, z_lo, z_hi):
        # Boolean mask of points strictly inside the open axis-aligned box.
        return (
            (points[0] > x_lo) & (points[0] < x_hi)
            & (points[1] > y_lo) & (points[1] < y_hi)
            & (points[2] > z_lo) & (points[2] < z_hi)
        )

    if not search_mode:
        palm_z = (z - hand_size[2], z)
        palm_y = (y - hand_size[0] / 2, y + hand_size[0] / 2)
        # Palm split into top / centre / bottom quarters along x.
        top_box = box_mask(x - hand_size[1] / 2, x - hand_size[1] / 4,
                           palm_y[0], palm_y[1], palm_z[0], palm_z[1])
        center_box = box_mask(x - hand_size[1] / 4, x + hand_size[1] / 4,
                              palm_y[0], palm_y[1], palm_z[0], palm_z[1])
        bottom_box = box_mask(x + hand_size[1] / 4, x + hand_size[1] / 2,
                              palm_y[0], palm_y[1], palm_z[0], palm_z[1])
        if (top_box.any() and bottom_box.any()) or center_box.any():
            return 1
        if bottom_box.any():
            return 2
        if top_box.any():
            return 3

    # Finger sweep volumes just outside the grasp opening.
    left_box = box_mask(
        x - left_size[1] / 2, x + left_size[1] / 2,
        y + grab_width / 2, y + grab_width / 2 + left_size[0],
        z - 0.05, z + left_size[2],
    )
    right_box = box_mask(
        x - right_size[1] / 2, x + right_size[1] / 2,
        y - (grab_width / 2 + right_size[0]), y - grab_width / 2,
        z - 0.05, z + right_size[2],
    )
    left_iou = left_box.sum() / (volume[0] + 1e-6)
    right_iou = right_box.sum() / (volume[1] + 1e-6)
    if left_iou > iou and right_iou > iou:
        return 6
    if left_iou > iou:
        return 4
    if right_iou > iou:
        return 5
    return 0
def refine_grasp_pose(
    points: np.ndarray,
    voxel_size: float,
    target_position: np.ndarray,
    expect_orientation: np.ndarray | None = None,
    search_mode: bool = False,
    **kwargs
):
    """
    Iteratively nudge a grasp pose out of collision with the scene cloud.

    The cloud is expressed in the grasp frame centred at target_position;
    `refine` accumulates a translational offset that is adjusted step by
    step until collision_detector reports no contact.

    input:
        points: np.ndarray (n, 3), already complete voxel downsample
        voxel_size: float, voxel edge of the downsample (scales finger volumes)
        target_position: np.ndarray (3, ), grab target position
        expect_orientation: np.ndarray (3, 3), expect grab target orientation
        search_mode: bool, Default False; when True, additionally widen the
            collision-free span along y and centre the grasp inside it
        **kwargs:
            "grab_width": float
            "hand_size": list [width, height, length]
            "left_size": list [thick, width, length]
            "right_size": list [thick, width, length]
            thick of gripper finger, width of gripper finger, length of gripper finger
    output:
        (4x4 homogeneous pose matrix, True) on success, (1202, False) on failure.
    """
    if expect_orientation is None:
        # Default fixed grasp frame.
        expect_orientation = np.asarray([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
    refine = np.zeros(3)  # accumulated offset, expressed in the grasp frame
    grab_width = kwargs.get('grab_width', 0.10)
    hand_size = kwargs.get('hand_size', [0.113, 0.063, 0.13])
    left_size = kwargs.get('left_size', [0.006, 0.037, 0.086])
    right_size = kwargs.get('right_size', [0.006, 0.037, 0.086])
    # Expected voxel count of a fully occupied finger volume.
    left_volume = left_size[0] * left_size[1] * left_size[2] / (voxel_size**3)
    right_volume = right_size[0] * right_size[1] * right_size[2] / (voxel_size**3)
    # Express the cloud in the grasp frame: translate, then rotate -> (3, n).
    points = points - target_position[None, :]  # (n, 3) - (1, 3) = (n, 3)
    points = expect_orientation.T @ points.T  # (3, 3) @ (3, n) = (3, n)
    # Crop to the gripper workspace (plus margin) to keep the loop cheap.
    points = points[:,
        (points[2] < left_size[2]) & (points[2] > -(0.1 + hand_size[2]))
        & (points[0] < (0.05 + hand_size[1]/2)) & (points[0] > -(0.05 + hand_size[1]/2))
        & (points[1] < (0.05 + grab_width/2 + left_size[0]))
        & (points[1] > -(0.05 + grab_width/2 + right_size[0]))
    ]
    # NOTE(review): `frist_sign` (sic) is only assigned True inside branches
    # guarded by `if frist_sign:`, so those z-retreat branches appear
    # unreachable — confirm the intended logic (possibly `if not frist_sign`).
    frist_sign = False
    y_n = y_p = False  # last adjustment moved -y / +y
    left_last = right_last = False
    hand_num = left_num = right_num = 0  # step budgets per adjustment kind
    while hand_num < 20 and left_num < 10 and right_num < 10:
        collision_code = collision_detector(
            points, refine, volume=[left_volume, right_volume], **kwargs
        )
        if collision_code == 0:
            # Take one extra step in the last successful y-direction for margin.
            if y_p and not y_n:
                refine[1] += 0.004
                y_p = y_n = False
                left_last = True
                right_last = False
                continue
            if not y_p and y_n:
                refine[1] -= 0.004
                y_p = y_n = False
                left_last = False
                right_last = True
                continue
            if search_mode:
                break  # widen the free span below before composing the pose
            position = target_position + (expect_orientation @ refine.T).T
            rmat = tfs.affines.compose(
                np.squeeze(np.asarray(position)), expect_orientation, [1, 1, 1]
            )
            return rmat, True
        if collision_code == 6:
            # Both fingers collide: no feasible grasp here.
            return 1202, False
        if collision_code == 1:
            # Palm blocked: retreat along z.
            refine[2] -= 0.008
            hand_num += 1
            print("z + 0.008")
            continue
        if collision_code == 2:
            # Palm bottom quarter collides: shift -x.
            if frist_sign:
                print("z + 0.004")
                refine[2] -= 0.004
                frist_sign = True
                continue
            refine[0] -= 0.002
            hand_num += 1
            print("x - 0.002")
            continue
        if collision_code == 3:
            # Palm top quarter collides: shift +x.
            if frist_sign:
                print("z + 0.004")
                refine[2] -= 0.004
                frist_sign = True
                continue
            refine[0] += 0.002
            hand_num += 1
            print("x + 0.002")
            continue
        if collision_code == 4:
            # Left finger collides: shift +y.
            y_p = True
            y_n = False
            refine[1] += 0.004
            left_num += 1
            print("y + 0.004")
            continue
        if collision_code == 5:
            # Right finger collides: shift -y.
            y_p = False
            y_n = True
            refine[1] -= 0.004
            right_num += 1
            print("y - 0.004")
            continue
        else:
            # Unknown collision code.
            return 1202, False
    if search_mode:
        # Measure the collision-free span along y, then centre the grasp in it.
        right_num = left_num = 0
        if left_last and not right_last:
            y_min = refine[1]
            while left_num < 10:
                left_num += 1
                refine[1] += 0.004
                collision_code = collision_detector(
                    points, refine, volume=[left_volume, right_volume], search_mode=True, **kwargs
                )
                if collision_code:
                    refine[1] = (refine[1] - 0.004 + y_min) / 2
                    break
            else:
                # No collision within budget: assume a wide free span.
                # NOTE(review): 0.2 m exceeds the probed 0.04 m range — confirm.
                refine[1] = y_min + 0.2
            print(f"left_num = {left_num}")
        elif not left_last and right_last:
            y_max = refine[1]
            while right_num < 10:
                right_num += 1
                refine[1] -= 0.004
                collision_code = collision_detector(
                    points, refine, volume=[left_volume, right_volume], search_mode=True, **kwargs
                )
                if collision_code:
                    refine[1] = (refine[1] + 0.004 + y_max) / 2
                    break
            else:
                refine[1] = y_max - 0.2
            print(f"right_num = {right_num}")
        elif not left_last and not right_last:
            # Never collided along y: probe both directions from the start.
            y_cur = refine[1]
            while left_num < 6:
                left_num += 1
                refine[1] += 0.004
                collision_code = collision_detector(
                    points, refine, volume=[left_volume, right_volume], search_mode=True, **kwargs
                )
                if collision_code:
                    y_max = refine[1] - 0.004
                    break
            else:
                y_max = refine[1] + 0.024
            print(f"left_num = {left_num}")
            refine[1] = y_cur
            while right_num < 6:
                right_num += 1
                refine[1] -= 0.004
                collision_code = collision_detector(
                    points, refine, volume=[left_volume, right_volume], search_mode=True, **kwargs
                )
                if collision_code:
                    refine[1] = (refine[1] + 0.004 + y_max) / 2
                    break
            else:
                refine[1] = (y_cur - 0.024 + y_max) / 2
            print(f"right_num = {right_num}")
        position = target_position + (expect_orientation @ refine.T).T
        rmat = tfs.affines.compose(
            np.squeeze(np.asarray(position)),
            expect_orientation,
            [1, 1, 1]
        )
        return rmat, True
    # Step budget exhausted without finding a collision-free pose.
    return 1202, False

View File

@@ -1,3 +1,6 @@
import os
import time
import cv2
import logging
import numpy as np
@@ -5,7 +8,7 @@ import numpy as np
__all__ = [
"crop_imgs_box_xywh", "crop_imgs_box_xyxy", "crop_imgs_mask", "get_map",
"distortion_correction",
"distortion_correction", "save_img"
]
def crop_imgs_box_xywh(imgs: list, box, same_sign: bool = False):
@@ -154,3 +157,28 @@ def distortion_correction(
return undistorted_color, undistorted_depth
def save_img(img: np.ndarray, save_name, save_dir: str | None = None, mark_cur_time: bool = False):
    """
    Save an image to disk.

    input:
        img: np.ndarray
            - uint8, BGR (H, W, 3)
            - uint16 single-channel
        save_name: str, file name including extension
        save_dir: str | None, target directory (created if missing); when
            None the user's home directory is used
        mark_cur_time: bool, append a millisecond timestamp to the file stem
    """
    if isinstance(save_dir, str):
        os.makedirs(save_dir, exist_ok=True)
        save_path = os.path.join(save_dir, save_name)
    else:
        # No directory given: fall back to the home directory.
        home_path = os.path.expanduser("~")
        save_path = os.path.join(home_path, save_name)
    if mark_cur_time:
        # e.g. foo.png -> foo_1700000000000.png
        cur_time = int(time.time() * 1000)
        path, ext = os.path.splitext(save_path)
        save_path = path + f'_{cur_time}' + ext
    cv2.imwrite(save_path, img)

View File

@@ -0,0 +1,152 @@
# import logging
#
# import cv2
import numpy as np
import open3d as o3d
__all__ = ["create_o3d_pcd", "create_o3d_denoised_pcd"]
def point_cloud_denoising(point_cloud: o3d.geometry.PointCloud, voxel_size: float = 0.005):
    """
    Denoise a point cloud and keep the cluster nearest to the camera.

    input:
        point_cloud: o3d.geometry.PointCloud
        voxel_size: float, voxel edge for downsampling; filter parameters are
            scaled relative to a 5 mm reference voxel
    output:
        (clean_pcd, True) on success,
        (1100, False) when the nearest cluster is too small (too noisy).
    """
    point_cloud = point_cloud.remove_non_finite_points()
    down_pcd = point_cloud.voxel_down_sample(voxel_size=voxel_size)
    point_num = len(down_pcd.points)
    # Radius filter
    clean_pcd, _ = down_pcd.remove_radius_outlier(
        nb_points=max(int(round(10 * voxel_size / 0.005)), 3),
        radius=voxel_size * 10
    )
    # Statistical filter
    clean_pcd, _ = clean_pcd.remove_statistical_outlier(
        nb_neighbors=max(int(round(10 * voxel_size / 0.005)), 3),
        std_ratio=2.0
    )
    # Drop points closer than 0.2 m to the camera.
    points = np.asarray(clean_pcd.points)
    clean_pcd.points = o3d.utility.Vector3dVector(points[points[:, 2] >= 0.2])
    labels = np.array(
        clean_pcd.cluster_dbscan(
            eps=voxel_size * 10,
            min_points=max(int(round(10 * voxel_size / 0.005)), 3)
        )
    )
    if len(labels[labels >= 0]) == 0:
        # No cluster found: return the filtered cloud as-is.
        return clean_pcd, True
    # Keep the cluster with the smallest mean depth (nearest object).
    points = np.asarray(clean_pcd.points)
    cluster_label = set(labels)
    point_cloud_clusters = []
    for label in cluster_label:
        if label == -1:
            continue
        idx = np.where(labels == label)[0]
        point_cloud_cluster = clean_pcd.select_by_index(idx)
        points_cluster_z = points[idx, 2]
        z_avg = np.mean(points_cluster_z)
        if z_avg < 0.2:
            continue
        point_cloud_clusters.append((point_cloud_cluster, z_avg))
    if len(point_cloud_clusters) == 0:
        # BUG FIX: this path previously returned a bare point cloud, which
        # broke callers that unpack (pcd, sign).
        return clean_pcd, True
    point_cloud_clusters.sort(key=lambda x: x[1])
    clean_pcd = point_cloud_clusters[0][0]
    # Use the nearest-cluster point ratio to judge noise strength.
    largest_cluster_ratio = len(clean_pcd.points) / point_num
    if largest_cluster_ratio < 0.05:
        return 1100, False
    return clean_pcd, True
def create_o3d_pcd(depth_img, camera_size, k, **kwargs):
    """
    Build a downsampled 3-D point array from a depth image.
    --------
    input:
        depth_img: np.ndarray, depth image (cast to uint16)
        camera_size: list, [width, height] in pixels
        k: np.ndarray | list, flattened 3x3 intrinsic matrix (row-major)
        **kwargs:
            "depth_scale": float, depth units per metre (default 1000.0)
            "depth_trunc": float, max depth kept, metres (default 3.0)
            "voxel_size": float, downsample voxel edge (default 0.01)
    output:
        orign_points: np.ndarray (n, 3) — NOTE: a numpy array, not an
        o3d.geometry.PointCloud.
    """
    intrinsics = o3d.camera.PinholeCameraIntrinsic(
        int(camera_size[0]),
        int(camera_size[1]),
        k[0], k[4], k[2], k[5]  # fx, fy, cx, cy from the flattened matrix
    )
    depth_o3d = o3d.geometry.Image(depth_img.astype(np.uint16))
    orign_point_clouds = o3d.geometry.PointCloud.create_from_depth_image(
        depth=depth_o3d,
        intrinsic=intrinsics,
        depth_scale=kwargs.get("depth_scale", 1000.0),
        depth_trunc=kwargs.get("depth_trunc", 3.0),
    )
    orign_point_clouds = orign_point_clouds.voxel_down_sample(
        kwargs.get("voxel_size", 0.01)
    )
    # NOTE(review): radius/statistical outlier filtering was deliberately
    # disabled here; denoising happens elsewhere (point_cloud_denoising).
    orign_points = np.asarray(orign_point_clouds.points)
    return orign_points
def create_o3d_denoised_pcd(depth_img_mask, intrinsics, **kwargs):
    """
    Build a point cloud from a masked depth image and denoise it.
    --------
    input:
        depth_img_mask: np.ndarray, depth image with non-object pixels zeroed
        intrinsics: o3d.camera.PinholeCameraIntrinsic
        **kwargs:
            "depth_scale": float (default 1000.0)
            "depth_trunc": float (default 3.0)
            "voxel_size": float (default 0.002)
    output:
        (point_cloud, True) on success, (error_code, False) on failure.
    """
    raw_cloud = o3d.geometry.PointCloud.create_from_depth_image(
        depth=o3d.geometry.Image(depth_img_mask.astype(np.uint16)),
        intrinsic=intrinsics,
        depth_scale=kwargs.get("depth_scale", 1000.0),
        depth_trunc=kwargs.get("depth_trunc", 3.0),
    )
    denoised, ok = point_cloud_denoising(raw_cloud, kwargs.get("voxel_size", 0.002))
    if not ok:
        return 1100, False
    if not len(denoised.points):
        return 1101, False
    return denoised, True

View File

@@ -1,260 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
import transforms3d as tfs
import numpy as np
import math
import json
import rclpy
from rclpy.node import Node
def get_matrix_quat(x, y, z, rw, rx, ry, rz):
    """Build a 4x4 homogeneous transform from a translation and a unit quaternion."""
    rotation = tfs.quaternions.quat2mat([rw, rx, ry, rz])
    translation = np.squeeze(np.asarray((x, y, z)))
    return tfs.affines.compose(translation, rotation, [1, 1, 1])
def get_matrix_eular_radu(x, y, z, rx, ry, rz):
    """Build a 4x4 homogeneous transform from a translation and Euler angles (radians)."""
    rotation = tfs.euler.euler2mat(rx, ry, rz)
    translation = np.squeeze(np.asarray((x, y, z)))
    return tfs.affines.compose(translation, rotation, [1, 1, 1])
def get_matrix_rotvector(x, y, z, rx, ry, rz):
    """Build a 4x4 homogeneous transform from a translation and a rotation vector."""
    rvec = np.array([rx, ry, rz])
    angle = np.linalg.norm(rvec)
    if angle < 1e-8:
        # Near-zero rotation: use the identity to avoid dividing by ~0.
        rotation = np.eye(3)
    else:
        rotation = tfs.axangles.axangle2mat(rvec / angle, angle)
    translation = np.squeeze(np.asarray((x, y, z)))
    return tfs.affines.compose(translation, rotation, [1, 1, 1])
def skew(v):
    """Return the 3x3 skew-symmetric (cross-product) matrix of a 3-vector."""
    a, b, c = v[0], v[1], v[2]
    return np.array([
        [0, -c, b],
        [c, 0, -a],
        [-b, a, 0],
    ])
def R2P(T):
    """Rotation matrix -> modified Rodrigues vector (Tsai-Lenz parameterisation)."""
    axis, angle = tfs.axangles.mat2axangle(T[0:3, 0:3])
    return 2 * math.sin(angle / 2) * axis
def P2R(P):
    """Modified Rodrigues vector -> rotation matrix (inverse of R2P)."""
    sq_norm = np.dot(P.T, P)
    outer = np.dot(P, P.T)
    cross = np.sqrt(4 - sq_norm) * skew(P.flatten().T)
    return (1 - sq_norm / 2) * np.eye(3) + (outer + cross) / 2
def calculate(Hgs, Hcs):
    """
    Tsai-Lenz linear hand-eye calibration.

    Solves AX = XB from pose lists: first the rotation via a least-squares
    system on modified Rodrigues vectors, then the translation with the
    rotation fixed.

    input:
        Hgs: list of 4x4 gripper-side homogeneous transforms
        Hcs: list of 4x4 camera-side homogeneous transforms
    output:
        4x4 homogeneous calibration matrix.
    """
    # Naming: upper-case H* are matrices, lower-case would be scalars.
    Hgijs = []
    Hcijs = []
    A = []
    B = []
    size = 0
    # Stage 1: accumulate the rotation system over every pose pair (i, j).
    for i in range(len(Hgs)):
        for j in range(i + 1, len(Hgs)):
            size += 1
            Hgij = np.dot(np.linalg.inv(Hgs[j]), Hgs[i])  # relative gripper motion
            Hgijs.append(Hgij)
            Pgij = np.dot(2, R2P(Hgij))
            Hcij = np.dot(Hcs[j], np.linalg.inv(Hcs[i]))  # relative camera motion
            Hcijs.append(Hcij)
            Pcij = np.dot(2, R2P(Hcij))
            # skew(Pgij + Pcij) * Pcg' = Pcij - Pgij
            A.append(skew(np.add(Pgij, Pcij)))
            B.append(np.subtract(Pcij, Pgij).reshape(3, 1))
    MA = np.vstack(A)
    MB = np.vstack(B)
    Pcg_ = np.dot(np.linalg.pinv(MA), MB)
    # Recover the rotation Rcg from the modified Rodrigues solution.
    pcg = np.sqrt(np.add(1, np.dot(Pcg_.T, Pcg_)))
    Pcg = np.dot(np.dot(2, Pcg_), np.linalg.inv(pcg))
    Rcg = P2R(Pcg).reshape(3, 3)
    # Stage 2: with Rcg fixed, solve a second least-squares system for Tcg.
    A = []
    B = []
    id = 0  # NOTE(review): shadows the builtin `id`
    for i in range(len(Hgs)):
        for j in range(i + 1, len(Hgs)):
            Hgij = Hgijs[id]
            Hcij = Hcijs[id]
            A.append(np.subtract(Hgij[0:3, 0:3], np.eye(3, 3)))
            B.append(np.subtract(np.dot(Rcg, Hcij[0:3, 3:4]), Hgij[0:3, 3:4]))
            id += 1
    MA = np.asarray(A).reshape(size * 3, 3)
    MB = np.asarray(B).reshape(size * 3, 1)
    Tcg = np.dot(np.linalg.pinv(MA), MB).reshape(3, )
    return tfs.affines.compose(Tcg, np.squeeze(Rcg), [1, 1, 1])
class Calibration(Node):
    """
    One-shot hand-eye calibration node.

    Reads recorded gripper and camera pose lists from JSON files, runs the
    Tsai-Lenz solver (`calculate`) and writes the resulting 4x4 matrix to
    `<matrix_name>_matrix.json`.
    """

    def __init__(self, name):
        super(Calibration, self).__init__(name)
        self.sync_subscriber = None
        self.sub_camera_pose = None
        self.sub_hand_pose = None
        # Homogeneous transforms fed to the solver.
        self.Hgs, self.Hcs = [], []
        # Raw flat pose value lists loaded from JSON.
        self.hand, self.camera = [], []
        self.calibration_matrix = None
        self.declare_parameter('matrix_name', 'eye_to_hand')
        self.matrix_name = self.get_parameter('matrix_name').value
        self.declare_parameter('mode', 'eye_to_hand')
        self.mode = self.get_parameter('mode').value.lower()
        if self.mode not in ['eye_in_hand', 'eye_to_hand']:
            raise ValueError("mode must be 'eye_in_hand' or 'eye_to_hand'")
        # Pose encoding of the input files: quaternion / Euler / rotation vector.
        self.declare_parameter('input', 'quat')
        self.input = self.get_parameter('input').value.lower()
        if self.input == 'eular':
            self.function = get_matrix_eular_radu
        elif self.input == 'rotvertor':
            self.function = get_matrix_rotvector
        elif self.input == 'quat':
            self.function = get_matrix_quat
        else:
            raise ValueError("INPUT ERROR")
        self.declare_parameter('camera_pose_path', 'camera_pose_data.json')
        self.declare_parameter('hand_pose_path', 'hand_pose_data.json')
        self.camera_pose_path = self.get_parameter('camera_pose_path').value
        self.hand_pose_path = self.get_parameter('hand_pose_path').value
        self.get_pose()
        # NOTE(review): get_pose() sets self.done = True on success, but this
        # line immediately resets it to False, so main() keeps spinning —
        # confirm the intended initialisation order.
        self.done = False

    def get_pose(self):
        """Load the pose files, solve the calibration and save the result."""
        with open(f'{self.camera_pose_path}', 'r', encoding='utf-8') as f:
            self.camera = json.load(f)
        with open(f'{self.hand_pose_path}', 'r', encoding='utf-8') as f:
            self.hand = json.load(f)
        self.calculate_calibration()
        print(self.hand)
        print(self.camera)
        self.get_logger().info(f"{self.calibration_matrix}")
        hand_eye_result = {
            "T": self.calibration_matrix.tolist()
        }
        with open(f"{self.matrix_name}_matrix.json", "w") as f:
            json.dump(hand_eye_result, f, indent=4)
        self.get_logger().info(f"Save as {self.matrix_name}_matrix.json")
        self.done = True

    def calculate_data(self):
        """Convert the flat pose lists into homogeneous transform lists.

        Quaternion input consumes 7 values per pose, other encodings 6.
        In eye_to_hand mode the gripper poses are inverted before use.
        """
        if self.input == 'quat':
            for i in range(0, len(self.hand), 7):
                self.Hgs.append(
                    np.linalg.inv(
                        self.function(
                            self.hand[i], self.hand[i + 1], self.hand[i + 2],
                            self.hand[i + 3], self.hand[i + 4], self.hand[i + 5], self.hand[i + 6]
                        )
                    )
                    if self.mode == 'eye_to_hand' else
                    self.function(
                        self.hand[i], self.hand[i + 1], self.hand[i + 2],
                        self.hand[i + 3], self.hand[i + 4], self.hand[i + 5], self.hand[i + 6]
                    )
                )
                self.Hcs.append(
                    self.function(
                        self.camera[i], self.camera[i + 1], self.camera[i + 2],
                        self.camera[i + 3], self.camera[i + 4], self.camera[i + 5], self.camera[i + 6]
                    )
                )
        else:
            for i in range(0, len(self.hand), 6):
                self.Hgs.append(
                    np.linalg.inv(
                        self.function(
                            self.hand[i], self.hand[i + 1], self.hand[i + 2],
                            self.hand[i + 3], self.hand[i + 4], self.hand[i + 5]
                        )
                    )
                    if self.mode == 'eye_to_hand' else
                    self.function(
                        self.hand[i], self.hand[i + 1], self.hand[i + 2],
                        self.hand[i + 3], self.hand[i + 4], self.hand[i + 5]
                    )
                )
                self.Hcs.append(
                    self.function(
                        self.camera[i], self.camera[i + 1], self.camera[i + 2],
                        self.camera[i + 3], self.camera[i + 4], self.camera[i + 5]
                    )
                )

    def calculate_calibration(self):
        """Build the transform lists and run the Tsai-Lenz solver."""
        self.calculate_data()
        self.calibration_matrix = calculate(self.Hgs, self.Hcs)
def main(args=None):
    """ROS2 entry point: spin the calibration node until it reports done."""
    rclpy.init(args=args)
    calib_node = Calibration('calibration')
    try:
        while rclpy.ok() and not calib_node.done:
            rclpy.spin_once(calib_node)
    except KeyboardInterrupt:
        pass
    finally:
        calib_node.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()

View File

@@ -1,323 +0,0 @@
import os
from collections import defaultdict
from ament_index_python.packages import get_package_share_directory
import cv2
import open3d as o3d
import numpy as np
import transforms3d as tfs
from cv_bridge import CvBridge
import rclpy
from rclpy.node import Node
from message_filters import ApproximateTimeSynchronizer, Subscriber
from sensor_msgs.msg import Image, CameraInfo
from geometry_msgs.msg import Pose, Point, Quaternion
from interfaces.msg import PoseClassAndID, PoseArrayClassAndID
share_dir = get_package_share_directory('vision_detect')
def get_map(K, D, camera_size):
    """Build undistortion remap tables and the new flattened intrinsic matrix."""
    width, height = camera_size
    K_mat = np.array(K).reshape(3, 3)
    D_vec = np.array(D)
    new_K, _ = cv2.getOptimalNewCameraMatrix(K_mat, D_vec, (width, height), 1, (width, height))
    map1, map2 = cv2.initUndistortRectifyMap(K_mat, D_vec, None, new_K, (width, height), cv2.CV_32FC1)
    return map1, map2, new_K.flatten()
def pca(data, sort=True):
    """Principal component analysis via covariance eigendecomposition.

    Returns (eigenvalues, eigenvectors) — columns are components — sorted by
    descending variance when `sort` is True, or (None, None) on failure.
    """
    centered = data - np.mean(data, axis=0)
    try:
        eigenvalues, eigenvectors = np.linalg.eigh(np.cov(centered.T))
    except np.linalg.LinAlgError:
        return None, None
    if sort:
        order = np.argsort(eigenvalues)[::-1]  # descending variance
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order]
    return eigenvalues, eigenvectors
def calculate_pose_cpu(mask, depth_img: np.ndarray, intrinsics):
    """
    Estimate a 6-DoF pose from the masked region of a depth image.

    Masked depth pixels are back-projected to a point cloud, filtered, and
    the orientation is taken from the cloud's PCA axes.

    input:
        mask: np.ndarray (H, W), nonzero where the object is
        depth_img: np.ndarray (H, W), depth cast to uint16 (mm, per depth_scale)
        intrinsics: o3d.camera.PinholeCameraIntrinsic
    output:
        x, y, z, rw, rx, ry, rz — position (m) and unit quaternion.
        All zeros when no valid points remain; None quaternion components
        (and zero position) when PCA fails.
    """
    # Keep only depth pixels under the mask.
    depth_img_mask = np.zeros_like(depth_img)
    depth_img_mask[mask > 0] = depth_img[mask > 0]
    depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
    point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
        depth=depth_o3d,
        intrinsic=intrinsics,
        depth_scale=1000.0,
        depth_trunc=2,
    )
    point_cloud = point_cloud.remove_non_finite_points()
    down_pcd = point_cloud.voxel_down_sample(voxel_size=0.02)
    clean_pcd, _ = down_pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=3.0)
    if len(clean_pcd.points) == 0:
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    center = clean_pcd.get_center()
    x, y, z = center
    w, v = pca(np.asarray(clean_pcd.points))
    if w is not None and v is not None:
        # Build a right-handed frame from the principal axes with a
        # deterministic sign convention.
        vx, vy, vz = v[:,0], v[:,1], v[:,2]
        if vx[0] < 0:
            vx = -vx
        if vy[1] < 0:
            vy = -vy
        if not np.allclose(np.cross(vx, vy), vz):
            vz = -vz
        R = np.column_stack((vx, vy, vz))
        rmat = tfs.affines.compose(np.squeeze(np.asarray((x, y, z))), R, [1, 1, 1])
        rw, rx, ry, rz = tfs.quaternions.mat2quat(rmat[0:3, 0:3])
        # (Debug visualisation of the principal axes was removed here.)
        return x, y, z, rw, rx, ry, rz
    else:
        # PCA failed (singular covariance): signal an invalid pose.
        # NOTE(review): the computed centre is discarded here — confirm.
        return 0.0, 0.0, 0.0, None, None, None, None
def distortion_correction(color_image, depth_image, map1, map2):
    """Undistort a colour/depth image pair using precomputed remap tables."""
    # Bilinear for colour; nearest-neighbour for depth so depths never blend.
    color_out = cv2.remap(color_image, map1, map2, cv2.INTER_LINEAR).astype(color_image.dtype)
    depth_out = cv2.remap(depth_image, map1, map2, cv2.INTER_NEAREST).astype(depth_image.dtype)
    return color_out, depth_out
def crop_mask_bbox(depth_img, mask):
    """
    Crop a depth image and its mask to the mask's bounding box.

    input:
        depth_img: H x W
        mask: H x W (0/1 or bool)
    output:
        depth_crop, mask_crop, (x_min, y_min) — the crops and the top-left
        corner of the bounding box in the original image.
        Returns (None, None, (0, 0)) when the mask is empty (callers test
        `depth_crop is None`).
    """
    high, width = depth_img.shape
    ys, xs = np.where(mask > 0)
    if xs.size == 0:
        # BUG FIX: an empty mask used to raise ValueError on xs.min().
        return None, None, (0, 0)
    x_min, x_max = int(xs.min()), int(xs.max())
    y_min, y_max = int(ys.min()), int(ys.max())
    # Clamp to the image and make the slice end-inclusive.
    y0, y1 = max(0, y_min), min(y_max, high) + 1
    x0, x1 = max(0, x_min), min(x_max, width) + 1
    return depth_img[y0:y1, x0:x1], mask[y0:y1, x0:x1], (x0, y0)
class DetectNode(Node):
    """
    Chessboard-based test detection node.

    Subscribes to synchronized colour/depth images, detects a chessboard,
    estimates its pose from the masked depth, and publishes the result as a
    PoseArrayClassAndID plus an annotated debug image.
    """

    def __init__(self, name):
        super().__init__(name)
        self.output_boxes = None
        self.output_masks = None
        self.K = self.D = None
        self.map1 = self.map2 = None
        self.intrinsics = None
        # Per-frame processing function and pose estimator (swappable hooks).
        self.function = self._test_image
        self.calculate_function = calculate_pose_cpu
        self.cv_bridge = CvBridge()
        # init
        self._init_param()
        self._init_publisher()
        self._init_subscriber()

    def _init_param(self):
        """Declare and read the node parameters."""
        self.declare_parameter('output_boxes', True)
        self.output_boxes = self.get_parameter('output_boxes').value
        self.declare_parameter('output_masks', False)
        self.output_masks = self.get_parameter('output_masks').value
        self.declare_parameter('color_image_topic', '/camera/color/image_raw')
        self.color_image_topic = self.get_parameter('color_image_topic').value
        self.declare_parameter('depth_image_topic', '/camera/depth/image_raw')
        self.depth_image_topic = self.get_parameter('depth_image_topic').value
        self.declare_parameter('camera_info_topic', '/camera/color/camera_info')
        self.camera_info_topic = self.get_parameter('camera_info_topic').value

    def _init_publisher(self):
        """Create the pose publisher and, if enabled, the debug image publisher."""
        self.pub_pose_list = self.create_publisher(PoseArrayClassAndID, '/pose/cv_detect_pose', 10)
        if self.output_boxes or self.output_masks:
            self.pub_detect_image = self.create_publisher(Image, '/image/detect_image', 10)

    def _init_subscriber(self):
        """Subscribe to camera info and the synchronized colour/depth pair."""
        self.sub_camera_info = self.create_subscription(
            CameraInfo,
            self.camera_info_topic,
            self._camera_info_callback,
            10
        )
        # Sync colour and depth images within a 0.1 s window.
        self.sub_color_image = Subscriber(self, Image, self.color_image_topic)
        self.sub_depth_image = Subscriber(self, Image, self.depth_image_topic)
        self.sync_subscriber = ApproximateTimeSynchronizer(
            [self.sub_color_image, self.sub_depth_image],
            queue_size=10,
            slop=0.1
        )
        self.sync_subscriber.registerCallback(self._sync_callback)

    def _camera_info_callback(self, msg: CameraInfo):
        """Cache the intrinsics once, build remap tables, then unsubscribe."""
        self.K = msg.k
        self.D = msg.d
        self.camera_size = [msg.width, msg.height]
        if self.K is not None and self.D is not None:
            # self.K becomes the flattened rectified intrinsic matrix.
            self.map1, self.map2, self.K = get_map(msg.k, msg.d, self.camera_size)
            if len(self.D) != 0:
                self.destroy_subscription(self.sub_camera_info)
            else:
                # No distortion supplied: assume all-zero coefficients.
                self.D = [0, 0, 0, 0, 0]
                self.destroy_subscription(self.sub_camera_info)
        else:
            # NOTE(review): raising a string is itself a TypeError — should
            # likely be `raise ValueError("K and D are not defined")`.
            raise "K and D are not defined"

    def _sync_callback(self, color_img_ros, depth_img_ros):
        """Synchronized colour/depth callback: detect, publish image and poses."""
        color_img_cv = self.cv_bridge.imgmsg_to_cv2(color_img_ros, "bgr8")
        depth_img_cv = self.cv_bridge.imgmsg_to_cv2(depth_img_ros, '16UC1')
        # NOTE(review): self.map1/map2 stay None until the first CameraInfo
        # message arrives; frames received before that will fail here.
        color_img_cv, depth_img_cv = distortion_correction(color_img_cv, depth_img_cv, self.map1, self.map2)
        img, pose_dict = self.function(color_img_cv, depth_img_cv)
        # No annotated image produced (empty masks): publish the raw frame.
        if img is None:
            img = self.cv_bridge.cv2_to_imgmsg(color_img_cv, "bgr8")
        if self.output_boxes or self.output_masks:
            self.pub_detect_image.publish(img)
        if pose_dict:
            pose_list_all = PoseArrayClassAndID()
            for (class_id, class_name), pose_list in pose_dict.items():
                pose_list_all.objects.append(
                    PoseClassAndID(
                        class_name = class_name,
                        class_id = class_id,
                        pose_list = pose_list
                    )
                )
            pose_list_all.header.stamp = self.get_clock().now().to_msg()
            pose_list_all.header.frame_id = "pose_list"
            self.pub_pose_list.publish(pose_list_all)

    def _test_image(self, rgb_img, depth_img):
        """Detect a chessboard and estimate its pose from the depth image.

        Returns (annotated image msg, pose dict) on detection, otherwise
        (raw image msg, None).
        """
        pose_dict = defaultdict(list)
        rgb_img_gray = cv2.cvtColor(rgb_img, cv2.COLOR_BGR2GRAY)
        pattern_size = (8, 5)
        # pattern_size = (15, 7)
        ret, corners = cv2.findChessboardCorners(rgb_img_gray, pattern_size, cv2.CALIB_CB_FAST_CHECK)
        if ret:
            # Refine corners to sub-pixel accuracy for a better pose.
            criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001)
            corners_subpix = cv2.cornerSubPix(rgb_img_gray, corners, (11, 11), (-1, -1), criteria)
            corners_subpix = corners_subpix.reshape(pattern_size[1], pattern_size[0], 2)
            # Fill every inner chessboard cell to build the board mask.
            mask = np.zeros(rgb_img_gray.shape, dtype=np.uint8)
            for i in range(0, pattern_size[1] - 1):
                for j in range(0, pattern_size[0] - 1):
                    pts = np.array([
                        corners_subpix[i, j],
                        corners_subpix[i, j + 1],
                        corners_subpix[i + 1, j + 1],
                        corners_subpix[i + 1, j]
                    ], dtype=np.int32)
                    cv2.fillConvexPoly(mask, pts, 1)
            # Tint the detected board red in the debug image.
            rgb_img[mask > 0] = rgb_img[mask > 0] * 0.5 + np.array([0, 0, 255]) * 0.5
            orig_shape = rgb_img_gray.shape
            mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
            depth_crop, mask_crop, (x_min, y_min) = crop_mask_bbox(depth_img, mask)
            if depth_crop is None:
                return None, None
            # Shift the principal point into the cropped image frame.
            intrinsics = o3d.camera.PinholeCameraIntrinsic(
                int(self.camera_size[0]),
                int(self.camera_size[1]),
                self.K[0],
                self.K[4],
                self.K[2] - x_min,
                self.K[5] - y_min
            )
            x, y, z, rw, rx, ry, rz = self.calculate_function(mask_crop, depth_crop, intrinsics)
            self.get_logger().info(f"{x}, {y}, {z}, {rw}, {rx}, {ry}, {rz}")
            # An all-zero position marks a failed pose estimate.
            if (x, y, z) != (0.0, 0.0, 0.0):
                pose = Pose()
                pose.position = Point(x=x, y=y, z=z)
                pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
                pose_dict[int(99), 'crossboard'].append(pose)
                cv2.putText(rgb_img, f'cs: x: {x:.3f}, y: {y:.3f}, z: {z:.3f}', (0, 0 + 30),cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2)
                cv2.putText(rgb_img, f'quat: rw: {rw:.3f}, rx: {rx:.3f}, ry: {ry:.3f}, rz: {rz:.3f}', (0, 0 + 70), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2)
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
        else:
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), None
def main(args=None):
    """ROS2 entry point: create the detection node and spin until shutdown."""
    rclpy.init(args=args)
    detect_node = DetectNode('detect')
    try:
        rclpy.spin(detect_node)
    except KeyboardInterrupt:
        pass
    finally:
        detect_node.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()

View File

@@ -1,575 +0,0 @@
import os
import time
import threading
import ast
from collections import defaultdict
from ament_index_python.packages import get_package_share_directory
import cv2
import open3d as o3d
import numpy as np
import transforms3d as tfs
from cv_bridge import CvBridge
import torch
from ultralytics import YOLO
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from geometry_msgs.msg import Pose, Point, Quaternion
from interfaces.msg import PoseClassAndID, ImgMsg
from interfaces.srv import VisionObjectRecognition
share_dir = get_package_share_directory('vision_detect')
def draw_pointcloud(pcd, T):
    """Visualise point clouds plus axis triads at the origin and at pose T."""
    rotation = T[0:3, 0:3]
    origin = T[0:3, 3:4].flatten()
    ax, ay, az = rotation[:, 0], rotation[:, 1], rotation[:, 2]
    # Two axis triads: one at the world origin, one at the given pose.
    vertices = [
        [0, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1],
        origin, origin + ax, origin + ay, origin + az
    ]
    edges = [[0, 1], [0, 2], [0, 3], [4, 5], [4, 6], [4, 7]]
    edge_colors = [
        [1, 0, 0], [0, 1, 0], [0, 0, 1],
        [1, 0, 0], [0, 1, 0], [0, 0, 1]
    ]
    axes = o3d.geometry.LineSet(
        points=o3d.utility.Vector3dVector(vertices),
        lines=o3d.utility.Vector2iVector(edges)
    )
    axes.colors = o3d.utility.Vector3dVector(edge_colors)
    pcd.append(axes)
    o3d.visualization.draw_geometries(pcd)
def preprocess_point_cloud(pcd, voxel_size):
    """Downsample a cloud and compute normals + FPFH features for registration."""
    pcd_down = pcd.voxel_down_sample(voxel_size)
    # Normals over a 2-voxel radius; FPFH over a 5-voxel radius.
    pcd_down.estimate_normals(
        o3d.geometry.KDTreeSearchParamHybrid(radius=voxel_size * 2, max_nn=30)
    )
    pcd_fpfh = o3d.pipelines.registration.compute_fpfh_feature(
        pcd_down,
        o3d.geometry.KDTreeSearchParamHybrid(radius=voxel_size * 5, max_nn=100)
    )
    return pcd_down, pcd_fpfh
def prepare_dataset(source, target, voxel_size):
    """Apply an identity initial transform and preprocess both clouds."""
    source.transform(np.identity(4))
    source_down, source_fpfh = preprocess_point_cloud(source, voxel_size)
    target_down, target_fpfh = preprocess_point_cloud(target, voxel_size)
    return source_down, target_down, source_fpfh, target_fpfh
def execute_global_registration(source_down, target_down, source_fpfh, target_fpfh, voxel_size):
    """Coarse RANSAC alignment on FPFH feature matches; returns a 4x4 transform."""
    distance_threshold = voxel_size * 1.5
    result = o3d.pipelines.registration.registration_ransac_based_on_feature_matching(
        source_down, target_down, source_fpfh, target_fpfh,
        True,  # mutual feature filter
        distance_threshold,
        o3d.pipelines.registration.TransformationEstimationPointToPoint(False),
        3,  # points sampled per RANSAC iteration
        [
            o3d.pipelines.registration.CorrespondenceCheckerBasedOnEdgeLength(0.9),
            o3d.pipelines.registration.CorrespondenceCheckerBasedOnDistance(distance_threshold)
        ],
        o3d.pipelines.registration.RANSACConvergenceCriteria(100000, 0.999)
    )
    return result.transformation
def object_icp(
    target: o3d.geometry.PointCloud,
    source: o3d.geometry.PointCloud | str,
    ransac_voxel_size: float = 0.005,
    icp_voxel_radius: list[float] | None = None,
    icp_max_iter: list[int] | None = None,
):
    """
    Register `source` onto `target`: global RANSAC alignment followed by
    three ICP refinement passes at decreasing voxel radii.

    input:
        target: observed point cloud
        source: model point cloud, or a file path readable by open3d
        ransac_voxel_size: distance scale for the global registration
        icp_voxel_radius: per-pass downsample radii (coarse -> fine)
        icp_max_iter: per-pass ICP iteration caps
    output:
        4x4 transformation matrix mapping source into target.
    """
    if icp_voxel_radius is None:
        icp_voxel_radius = [0.004, 0.002, 0.001]
    if icp_max_iter is None:
        icp_max_iter = [50, 30, 14]
    if isinstance(source, str):
        source = o3d.io.read_point_cloud(source)
    elif isinstance(source, o3d.geometry.PointCloud):
        pass
    else:
        raise TypeError(f"Unsupported Type {type(source)}")
    voxel_size = 0.005  # means 5mm for this dataset
    source_down, target_down, source_fpfh, target_fpfh = prepare_dataset(source, target, voxel_size)
    # Coarse global alignment used as the ICP seed.
    T = execute_global_registration(
        source_down,
        target_down,
        source_fpfh,
        target_fpfh,
        ransac_voxel_size
    )
    # Multi-scale ICP refinement, coarse to fine.
    for scale in range(3):
        iter = icp_max_iter[scale]  # NOTE(review): shadows the builtin `iter`
        radius = icp_voxel_radius[scale]
        source_down = source.voxel_down_sample(radius)
        target_down = target.voxel_down_sample(radius)
        source_down.estimate_normals(o3d.geometry.KDTreeSearchParamHybrid(radius=radius * 2, max_nn=30))
        target_down.estimate_normals(o3d.geometry.KDTreeSearchParamHybrid(radius=radius * 2, max_nn=30))
        result_icp = o3d.pipelines.registration.registration_icp(
            source_down,
            target_down,
            radius * 5,  # max correspondence distance for this pass
            T,
            o3d.pipelines.registration.TransformationEstimationPointToPoint(),
            o3d.pipelines.registration.ICPConvergenceCriteria(
                relative_fitness=1e-6,
                relative_rmse=1e-6,
                max_iteration=iter
            )
        )
        T = result_icp.transformation
    # NOTE(review): opens a blocking visualisation window and mutates
    # `source` in place via transform() — looks like debug code; confirm
    # before production use.
    draw_pointcloud([source.transform(T), target], T)
    return T
def get_map(K, D, camera_size):
    """Build undistortion remap tables for a camera.

    Args:
        K: intrinsic matrix, flat (9,) or 3x3.
        D: distortion coefficients.
        camera_size: (width, height) of the image.

    Returns:
        (map1, map2, new_K_flat): lookup tables for cv2.remap plus the
        flattened optimal intrinsic matrix valid after undistortion.
    """
    width, height = camera_size
    intrinsic = np.asarray(K).reshape(3, 3)
    dist = np.asarray(D)
    optimal_K, _ = cv2.getOptimalNewCameraMatrix(
        intrinsic, dist, (width, height), 1, (width, height))
    map1, map2 = cv2.initUndistortRectifyMap(
        intrinsic, dist, None, optimal_K, (width, height), cv2.CV_32FC1)
    return map1, map2, optimal_K.flatten()
def pca(data, sort=True):
    """Principal component analysis of an (N, D) point set.

    Args:
        data: array-like of shape (N, D), one sample per row.
        sort: when True, order components by descending eigenvalue.

    Returns:
        (eigenvalues, eigenvectors) of the sample covariance matrix, with
        eigenvectors in columns; (None, None) if the decomposition fails.
    """
    center = np.mean(data, axis=0)
    centered_points = data - center  # remove the centroid
    try:
        cov_matrix = np.cov(centered_points.T)  # np.cov wants variables in rows
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    except np.linalg.LinAlgError:
        return None, None
    if sort:
        # Use a distinct name for the permutation: the original rebound the
        # boolean `sort` parameter itself to an ndarray, shadowing the flag.
        order = eigenvalues.argsort()[::-1]  # descending
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:, order]
    return eigenvalues, eigenvectors
def calculate_pose_cpu(mask, depth_img: np.ndarray, intrinsics, model_pcd):
    """Estimate the 6-DoF pose of a masked object via model-to-scene ICP.

    Args:
        mask: binary instance mask aligned with depth_img.
        depth_img: 16-bit depth image in millimetres.
        intrinsics: o3d.camera.PinholeCameraIntrinsic matching depth_img.
        model_pcd: reference model point cloud registered onto the scene.

    Returns:
        (x, y, z, rw, rx, ry, rz): translation in metres plus a w-first
        quaternion; all zeros when no valid points survive filtering.
    """
    # Keep only the depth pixels covered by the instance mask.
    depth_img_mask = np.zeros_like(depth_img)
    depth_img_mask[mask > 0] = depth_img[mask > 0]
    depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
    point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
        depth=depth_o3d,
        intrinsic=intrinsics,
        depth_scale=1000.0,  # millimetres -> metres
        depth_trunc=2.0,  # discard points beyond 2 m
    )
    point_cloud = point_cloud.remove_non_finite_points()
    # Two-stage outlier rejection: radius filter, then statistical filter.
    clean_pcd, _ = point_cloud.remove_radius_outlier(nb_points=10, radius=0.1)
    clean_pcd, _ = clean_pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=3.0)
    if len(clean_pcd.points) == 0:
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    # Registration yields the model->scene transform; its translation and
    # rotation are the object's pose in the camera frame.
    rmat = object_icp(model_pcd, clean_pcd)
    x, y, z = rmat[0:3, 3:4].flatten()
    rw, rx, ry, rz = tfs.quaternions.mat2quat(rmat[0:3, 0:3])
    return x, y, z, rw, rx, ry, rz
def draw_box(set_confidence, rgb_img, result):
    """Draw a labelled bounding box for every detection above the cut.

    Args:
        set_confidence: minimum confidence for a box to be drawn.
        rgb_img: BGR image annotated in place.
        result: one ultralytics result holding boxes and names.
    """
    boxes = result.boxes.xywh.cpu().numpy()
    confidences = result.boxes.conf.cpu().numpy()
    class_ids = result.boxes.cls.cpu().numpy()
    labels = result.names
    color = (255, 255, 0)
    for box, conf, cls in zip(boxes, confidences, class_ids):
        if conf < set_confidence:
            continue
        cx, cy, bw, bh = box[:4]
        top_left = [int(cx - bw / 2), int(cy - bh / 2)]
        bottom_right = [int(cx + bw / 2), int(cy + bh / 2)]
        cv2.rectangle(rgb_img, top_left, bottom_right, color, 2)
        caption = f'{labels[cls]}: {conf*100:.2f}'
        cv2.putText(rgb_img, caption, (top_left[0], top_left[1] - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)
def draw_mask(set_confidence, rgb_img, result):
    """Alpha-blend red over the pixels of each confident instance mask.

    Args:
        set_confidence: minimum confidence for a mask to be drawn.
        rgb_img: BGR image modified in place.
        result: one ultralytics result holding masks and boxes.
    """
    masks = result.masks.data.cpu().numpy()
    orig_shape = result.masks.orig_shape
    confidences = result.boxes.conf.cpu().numpy()
    # The original ended with a redundant `else: continue`; the loop
    # continues regardless, so it was dead code and is dropped here.
    for i, mask in enumerate(masks):
        if confidences[i] >= set_confidence:
            # Upscale the low-resolution mask to full image size.
            mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
            # 50/50 blend with pure red (BGR order) over masked pixels.
            rgb_img[mask > 0] = rgb_img[mask > 0] * 0.5 + np.array([0, 0, 255]) * 0.5
def distortion_correction(color_image, depth_image, map1, map2):
    """Undistort a color/depth image pair with precomputed remap tables.

    Color is resampled bilinearly; depth uses nearest-neighbour so no
    interpolated (physically meaningless) depth values are invented.

    Returns:
        (undistorted_color, undistorted_depth) with the input dtypes.
    """
    color_out = cv2.remap(color_image, map1, map2, cv2.INTER_LINEAR).astype(color_image.dtype)
    depth_out = cv2.remap(depth_image, map1, map2, cv2.INTER_NEAREST).astype(depth_image.dtype)
    return color_out, depth_out
def crop_mask_bbox(depth_img, mask, box):
    """Crop depth and mask to a detection's bounding box, clamped to bounds.

    Args:
        depth_img: depth image, H x W.
        mask: instance mask, H x W (0/1 or bool).
        box: (x_center, y_center, w, h) in pixels; extra entries ignored.

    Returns:
        (depth_crop, mask_crop, (x_off, y_off)) — the offsets are the crop's
        top-left corner in full-image coordinates. (The original docstring
        omitted the offset tuple.)
    """
    height, width = depth_img.shape  # `high` in the original meant image height
    x_center, y_center, w, h = box[:4]
    x_min, x_max = int(round(x_center - w/2)), int(round(x_center + w/2))
    y_min, y_max = int(round(y_center - h/2)), int(round(y_center + h/2))
    # Clamp to the image; the +1 keeps the far edge inclusive, and numpy
    # slicing silently clips an end index one past the last row/column.
    y0, y1 = max(0, y_min), min(y_max, height) + 1
    x0, x1 = max(0, x_min), min(x_max, width) + 1
    depth_crop = depth_img[y0:y1, x0:x1]
    mask_crop = mask[y0:y1, x0:x1]
    return depth_crop, mask_crop, (x0, y0)
def calculate_grav_width(mask, K, depth):
    """Estimate the graspable width (metres) of the object in *mask*.

    Fits a minimum-area rectangle to the mask's outer contour and
    back-projects its shorter side through the pinhole model at *depth*.

    Args:
        mask: binary instance mask (uint8, H x W).
        K: flattened 3x3 intrinsics (fx = K[0], fy = K[4]).
        depth: object depth in metres.

    Returns:
        float: estimated width, 0.0 when no contour is found.
    """
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if contours:
        # NOTE(review): only contours[0] is used — assumes one outer contour
        # per mask; confirm behavior for fragmented masks.
        box = cv2.boxPoints(cv2.minAreaRect(contours[0]))
        # boxPoints returns 4 corners in order; pick the shorter of the two
        # adjacent sides as the grasp width.
        if np.linalg.norm(box[1] - box[0]) < np.linalg.norm(box[1] - box[2]):
            point_diff = box[1] - box[0]
        else:
            point_diff = box[1] - box[2]
        # Pixel extent -> metric width via similar triangles, per-axis focal.
        grab_width = depth * np.sqrt(point_diff[0]**2 / K[0]**2 + point_diff[1]**2 / K[4]**2)
        return grab_width
    else:
        return 0.0
class DetectNode(Node):
    """ROS 2 node that runs YOLO instance segmentation on camera frames and
    answers /vision_object_recognition service calls with 6-DoF object poses
    computed by ICP against a reference model point cloud."""

    def __init__(self, name, mode):
        super().__init__(name)
        self.mode = mode
        # Populated by _init_param / _init_model below.
        self.device = None
        self.checkpoint_path = None
        self.checkpoint_name = None
        self.output_boxes = None
        self.output_masks = None
        self.function = None  # per-frame inference entry point
        self.calculate_function = None  # pose backend ('cpu' only for now)
        self.K = None  # flattened 3x3 intrinsics of the camera being served
        self.fx = self.fy = 0.5  # down-scale factors applied to large crops
        self.camera_data = {}  # latest frame bundle per camera position
        self.cv_bridge = CvBridge()
        self.lock = threading.Lock()  # guards camera_data (subscriber vs. service)
        '''init'''
        self._init_param()
        self.source_model = o3d.io.read_point_cloud(self.source_model_path)
        if mode == 'detect':
            self._init_model()
        else:
            # Unknown mode: self.function stays None and a later service
            # call would fail when it is invoked.
            self.function = None
            self.get_logger().error('Error: Mode Unknown')
        if self.device == 'cpu':
            self.calculate_function = calculate_pose_cpu
        elif self.device == 'gpu':
            raise NotImplementedError('Function: calculate_pose_gpu not implemented')
        else:
            raise ValueError(f"device must be cpu or gpu, now {self.device}")
        self._init_publisher()
        self._init_subscriber()
        self._init_service()

    def _init_param(self):
        """Declare ROS parameters and cache their values."""
        self.declare_parameter('checkpoint_name', 'yolo11s-seg.pt')
        self.checkpoint_name = self.get_parameter('checkpoint_name').value
        self.checkpoint_path = os.path.join(share_dir, 'checkpoints', self.checkpoint_name)
        self.source_model_path = os.path.join(share_dir, 'pointclouds/bottle_model.pcd')
        self.declare_parameter('output_boxes', True)
        self.output_boxes = self.get_parameter('output_boxes').value
        self.declare_parameter('output_masks', False)
        self.output_masks = self.get_parameter('output_masks').value
        self.declare_parameter('set_confidence', 0.25)
        self.set_confidence = self.get_parameter('set_confidence').value
        self.declare_parameter('device', 'cpu')
        self.device = self.get_parameter('device').value
        self.declare_parameter('classes', 'None')
        # 'classes' arrives as a string (e.g. "None" or "[0, 39]") and is
        # parsed as a Python literal.
        self.classes = ast.literal_eval(self.get_parameter('classes').value)

    def _init_model(self):
        """Load the YOLO checkpoint and select the inference entry point."""
        device_model = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        try:
            self.model = YOLO(self.checkpoint_path).to(device_model)
        except Exception as e:
            self.get_logger().error(f'Failed to load YOLO model: {e}')
            raise
        self.get_logger().info(f'Loading checkpoint from: {self.checkpoint_path}')
        if self.checkpoint_name.endswith('-seg.pt'):
            self.function = self._seg_image
        else:
            # Non-segmentation checkpoint: no handler is installed.
            self.function = None
            self.get_logger().error(f'Unknown checkpoint: {self.checkpoint_name}')

    def _init_publisher(self):
        """Create the annotated-image publisher when any overlay is enabled."""
        if self.output_boxes or self.output_masks:
            self.pub_detect_image = self.create_publisher(Image, '/image/detect_image', 10)

    def _init_service(self):
        """Create the object-recognition service server."""
        self.server = self.create_service(
            VisionObjectRecognition,
            "/vision_object_recognition",
            self._service_callback
        )

    def _init_subscriber(self):
        """Subscribe to the camera frame stream."""
        self.sub_img = self.create_subscription(
            ImgMsg,
            "/img_msg",
            self._sub_callback,
            10
        )

    def _sub_callback(self, msg):
        """Cache the latest frame bundle keyed by the camera position."""
        with self.lock:
            self.camera_data[msg.position] = [
                msg.image_color,
                msg.image_depth,
                msg.karr,  # intrinsics
                msg.darr   # distortion coefficients
            ]

    def _service_callback(self, request, response):
        """Run detection on the requested camera's latest frame and fill the response."""
        response.header.stamp = self.get_clock().now().to_msg()
        response.header.frame_id = "camera_detect"
        with self.lock:
            if request.camera_position in self.camera_data:
                color_img_ros, depth_img_ros, self.K, D = self.camera_data[request.camera_position]
            else:
                response.success = False
                response.info = f"{request.camera_position} Camera data is empty or name is wrong"
                response.objects = []
                return response
        self.camera_size = [color_img_ros.width, color_img_ros.height]
        color_img_cv = self.cv_bridge.imgmsg_to_cv2(color_img_ros, "bgr8")
        depth_img_cv = self.cv_bridge.imgmsg_to_cv2(depth_img_ros, '16UC1')
        # Undistort both images and switch to the post-undistortion intrinsics.
        map1, map2, self.K = get_map(self.K, D, self.camera_size)
        color_img_cv, depth_img_cv = distortion_correction(color_img_cv, depth_img_cv, map1, map2)
        img, pose_dict = self.function(color_img_cv, depth_img_cv)
        """masks为空结束这一帧"""
        # When nothing was segmented, publish the raw frame instead of an
        # annotated one.
        if self.output_boxes or self.output_masks:
            if img is None:
                img = color_img_ros
            self.pub_detect_image.publish(img)
        if pose_dict:
            response.info = "Success get pose"
            response.success = True
            # self.get_logger().info('get_pose')
            for (class_id, class_name), pose_list in pose_dict.items():
                response.objects.append(
                    PoseClassAndID(
                        class_name = class_name,
                        class_id = class_id,
                        pose_list = pose_list
                    )
                )
        else:
            response.info = "pose dict is empty"
            response.success = False
            response.objects = []
        return response

    def _seg_image(self, rgb_img: np.ndarray, depth_img: np.ndarray):
        """Segment objects, estimate one pose per instance, draw overlays.

        Returns:
            (annotated ROS Image or None, pose_dict) where pose_dict maps
            (class_id, class_name) -> list[Pose]; (None, None) when nothing
            was segmented.
        """
        pose_dict = defaultdict(list)
        '''Get Predict Results'''
        # time1 = time.time()
        # NOTE(review): classes is hard-coded to [39] (presumably the COCO
        # 'bottle' id); the declared 'classes' parameter is ignored here.
        results = self.model(rgb_img, retina_masks=True, conf=self.set_confidence, classes=[39])
        # time2 = time.time()
        result = results[0]
        '''Get masks'''
        if result.masks is None or len(result.masks) == 0:
            return None, None
        masks = result.masks.data.cpu().numpy()
        orig_shape = result.masks.orig_shape
        '''Get boxes'''
        boxes = result.boxes.data.cpu().numpy()
        class_ids = result.boxes.cls.cpu().numpy()
        labels = result.names
        # time3 = time.time()
        for i, (mask, box) in enumerate(zip(masks, boxes)):
            mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
            depth_crop, mask_crop, (x_min, y_min) = crop_mask_bbox(depth_img, mask, box)
            if depth_crop is None:
                continue
            if mask.shape[0] >= (orig_shape[0] * 0.5) and mask.shape[1] >= (orig_shape[1] * 0.5):
                # Large instance: down-scale the crop by (fx, fy) to bound the
                # ICP cost, and scale the intrinsics to match.
                mask_crop = cv2.resize(mask_crop, None, fx=self.fx, fy=self.fy, interpolation=cv2.INTER_NEAREST)
                depth_crop = cv2.resize(depth_crop, None, fx=self.fx, fy=self.fy, interpolation=cv2.INTER_NEAREST)
                intrinsics = o3d.camera.PinholeCameraIntrinsic(
                    int(self.camera_size[0] * self.fx),
                    int(self.camera_size[1] * self.fy),
                    self.K[0] * self.fx,
                    self.K[4] * self.fy,
                    (self.K[2] - x_min) * self.fx,
                    (self.K[5] - y_min) * self.fy
                )
            else:
                # Shift the principal point into the crop's coordinate frame.
                intrinsics = o3d.camera.PinholeCameraIntrinsic(
                    int(self.camera_size[0]),
                    int(self.camera_size[1]),
                    self.K[0],
                    self.K[4],
                    self.K[2] - x_min,
                    self.K[5] - y_min
                )
            x, y, z, rw, rx, ry, rz = self.calculate_function(mask_crop, depth_crop, intrinsics, self.source_model)
            grab_width = calculate_grav_width(mask, self.K, z)
            # Push the grasp depth in by 45% of the estimated object width —
            # presumably toward the object's centreline; TODO confirm factor.
            z = z + grab_width * 0.45
            if (x, y, z) == (None, None, None):
                self.get_logger().error("have wrong pose")
                continue
            # An all-zero position is the "no valid points" sentinel from
            # calculate_pose_cpu and is silently skipped.
            if (x, y, z) != (0.0, 0.0, 0.0):
                pose = Pose()
                pose.position = Point(x=x, y=y, z=z)
                pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
                pose_dict[int(class_ids[i]), labels[class_ids[i]]].append(pose)
        # time4 = time.time()
        # self.get_logger().info(f'start')
        # self.get_logger().info(f'{(time2 - time1)*1000} ms, model predict')
        # self.get_logger().info(f'{(time3 - time2)*1000} ms, get mask and some param')
        # self.get_logger().info(f'{(time4 - time3)*1000} ms, calculate all mask PCA')
        # self.get_logger().info(f'{(time4 - time1)*1000} ms, completing a picture entire process')
        # self.get_logger().info(f'end')
        '''mask_img and box_img is or not output'''
        if self.output_boxes and not self.output_masks:
            draw_box(self.set_confidence, rgb_img, result)
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
        elif self.output_boxes and self.output_masks:
            draw_box(self.set_confidence, rgb_img, result)
            draw_mask(self.set_confidence, rgb_img, result)
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
        elif not self.output_boxes and self.output_masks:
            draw_mask(self.set_confidence, rgb_img, result)
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
        else:
            return None, pose_dict
def main(args=None):
    """Entry point: initialise rclpy and spin the node until interrupted."""
    rclpy.init(args=args)
    detect_node = DetectNode('detect', 'detect')
    try:
        rclpy.spin(detect_node)
    except KeyboardInterrupt:
        pass
    finally:
        # Always release node resources and shut the context down cleanly.
        detect_node.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()

View File

@@ -1,446 +0,0 @@
import os
import time
import threading
import ast
from collections import defaultdict
from ament_index_python.packages import get_package_share_directory
import cv2
import open3d as o3d
import numpy as np
import transforms3d as tfs
from cv_bridge import CvBridge
import torch
from ultralytics import YOLO
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from geometry_msgs.msg import Pose, Point, Quaternion
from interfaces.msg import PoseClassAndID, ImgMsg
from interfaces.srv import VisionObjectRecognition
share_dir = get_package_share_directory('vision_detect')
def get_map(K, D, camera_size):
    """Build cv2 undistortion remap tables.

    Returns (map1, map2, new_K.flatten()): remap LUTs plus the flattened
    optimal intrinsics valid after undistortion.
    """
    h, w = camera_size[::-1]  # camera_size is (width, height)
    K = np.array(K).reshape(3, 3)
    D = np.array(D)
    new_K, _ = cv2.getOptimalNewCameraMatrix(K, D, (w, h), 1, (w, h))
    map1, map2 = cv2.initUndistortRectifyMap(K, D, None, new_K, (w, h), cv2.CV_32FC1)
    return map1, map2, new_K.flatten()
def pca(data, sort=True):
    """Principal component analysis of an (N, D) point set.

    Returns (eigenvalues, eigenvectors) of the covariance matrix —
    descending when sort=True — or (None, None) on failure.
    """
    center = np.mean(data, axis=0)
    centered_points = data - center  # remove the centroid
    try:
        cov_matrix = np.cov(centered_points.T)  # np.cov wants variables in rows
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    except np.linalg.LinAlgError:
        return None, None
    if sort:
        # NOTE: rebinds the boolean parameter `sort` to the index array.
        sort = eigenvalues.argsort()[::-1]  # descending order
        eigenvalues = eigenvalues[sort]
        eigenvectors = eigenvectors[:, sort]
    return eigenvalues, eigenvectors
def calculate_pose_cpu(mask, depth_img: np.ndarray, intrinsics, hand_eye_mat):
    """Estimate a PCA-based 6-DoF pose for the masked depth region.

    Builds a point cloud from the masked depth pixels, filters outliers,
    takes the cloud centroid as the position and the PCA axes as the
    orientation, then maps the pose through the hand-eye calibration.

    Args:
        mask: binary instance mask aligned with depth_img.
        depth_img: 16-bit depth image in millimetres.
        intrinsics: o3d.camera.PinholeCameraIntrinsic matching depth_img.
        hand_eye_mat: 4x4 camera-to-robot transform.

    Returns:
        (x, y, z, rw, rx, ry, rz); all zeros when no points survive
        filtering, and (0, 0, 0, None, None, None, None) when PCA fails.
    """
    depth_img_mask = np.zeros_like(depth_img)
    depth_img_mask[mask > 0] = depth_img[mask > 0]
    depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
    point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
        depth=depth_o3d,
        intrinsic=intrinsics,
        depth_scale=1000.0,  # millimetres -> metres
        depth_trunc=8.0,
    )
    point_cloud = point_cloud.remove_non_finite_points()
    down_pcd = point_cloud.voxel_down_sample(voxel_size=0.022)
    clean_pcd, _ = down_pcd.remove_radius_outlier(nb_points=10, radius=0.1)
    clean_pcd, _ = clean_pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=3.0)
    if len(clean_pcd.points) == 0:
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    center = clean_pcd.get_center()
    x, y, z = center
    w, v = pca(np.asarray(clean_pcd.points))
    if w is None or v is None:
        # BUG FIX: the original fell through to the final return with
        # rw/rx/ry/rz (and rmat) unbound, raising NameError whenever PCA
        # failed. Return a zero position the caller already filters out.
        return 0.0, 0.0, 0.0, None, None, None, None
    vx, vy, vz = v[:,0], v[:,1], v[:,2]
    # Fix the sign of the first two axes for a reproducible frame, then
    # flip the third if needed so the basis stays right-handed.
    if vx[0] < 0:
        vx = -vx
    if vy[1] < 0:
        vy = -vy
    if not np.allclose(np.cross(vx, vy), vz):
        vz = -vz
    R = np.column_stack((vx, vy, vz))
    rmat = tfs.affines.compose(np.squeeze(np.asarray((x, y, z))), R, [1, 1, 1])
    rmat = hand_eye_mat @ rmat  # camera frame -> robot frame
    rw, rx, ry, rz = tfs.quaternions.mat2quat(rmat[0:3, 0:3])
    x, y, z = rmat[0:3, 3].flatten()
    return x, y, z, rw, rx, ry, rz
def draw_box(set_confidence, rgb_img, result):
    """Draw a labelled bounding box (in place) for each detection whose
    confidence reaches set_confidence."""
    boxes = result.boxes.xywh.cpu().numpy()
    confidences = result.boxes.conf.cpu().numpy()
    class_ids = result.boxes.cls.cpu().numpy()
    labels = result.names
    for i, box in enumerate(boxes):
        if confidences[i] >= set_confidence:
            x_center, y_center, width, height = box[:4]
            # Convert centre/size to the two opposite corners.
            p1 = [int((x_center - width / 2)), int((y_center - height / 2))]
            p2 = [int((x_center + width / 2)), int((y_center + height / 2))]
            cv2.rectangle(rgb_img, p1, p2, (255, 255, 0), 2)
            cv2.putText(rgb_img, f'{labels[class_ids[i]]}: {confidences[i]*100:.2f}', (p1[0], p1[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2)
def draw_mask(set_confidence, rgb_img, result):
    """Alpha-blend red (in place) over each confident instance mask."""
    masks = result.masks.data.cpu().numpy()
    orig_shape = result.masks.orig_shape
    confidences = result.boxes.conf.cpu().numpy()
    for i, mask in enumerate(masks):
        if confidences[i] >= set_confidence:
            # Upscale the low-resolution mask to the original image size.
            mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
            # 50/50 blend with pure red (BGR order) over masked pixels.
            rgb_img[mask > 0] = rgb_img[mask > 0] * 0.5 + np.array([0, 0, 255]) * 0.5
        else:
            continue
def distortion_correction(color_image, depth_image, map1, map2):
    """Undistort a color/depth pair with precomputed remap tables.

    Color is resampled bilinearly; depth uses nearest-neighbour so no
    interpolated depth values are invented.
    """
    undistorted_color = cv2.remap(color_image, map1, map2, cv2.INTER_LINEAR)
    undistorted_color = undistorted_color.astype(color_image.dtype)
    undistorted_depth = cv2.remap(depth_image, map1, map2, cv2.INTER_NEAREST)
    undistorted_depth = undistorted_depth.astype(depth_image.dtype)
    return undistorted_color, undistorted_depth
def crop_mask_bbox(depth_img, mask, box):
    """Crop depth and mask to a detection's bounding box, clamped to bounds.

    Args:
        depth_img: depth image, H x W.
        mask: instance mask, H x W (0/1 or bool).
        box: (x_center, y_center, w, h) in pixels.

    Returns:
        (depth_crop, mask_crop, (x_off, y_off)) — the offsets are the crop's
        top-left corner in full-image coordinates.
    """
    high, width = depth_img.shape  # `high` is the image height
    x_center, y_center, w, h = box[:4]
    x_min, x_max = int(round(x_center - w/2)), int(round(x_center + w/2))
    y_min, y_max = int(round(y_center - h/2)), int(round(y_center + h/2))
    # Clamp to the image; numpy slicing clips an end index past the edge.
    depth_crop = depth_img[max(0, y_min):min(y_max, high) + 1, max(0, x_min):min(x_max, width) + 1]
    mask_crop = mask[max(0, y_min):min(y_max, high) + 1, max(0, x_min):min(x_max, width) + 1]
    return depth_crop, mask_crop, (max(0, x_min), max(0, y_min))
class DetectNode(Node):
    """ROS 2 node: YOLO instance segmentation plus PCA-based pose
    estimation, served through /vision_object_recognition."""

    def __init__(self, name, mode):
        super().__init__(name)
        self.mode = mode
        self.device = None
        self.checkpoint_path = None
        self.checkpoint_name = None
        self.output_boxes = None
        self.output_masks = None
        self.function = None  # per-frame inference entry point
        self.calculate_function = None  # pose backend ('cpu' only for now)
        self.eye_in_hand_mat = None  # hand-eye calibration for wrist cameras
        self.eye_to_hand_mat = None  # hand-eye calibration for the fixed camera
        self.K = None  # flattened 3x3 intrinsics of the camera being served
        self.fx = self.fy = 0.5  # down-scale factors (unused in this revision)
        self.camera_data = {}  # latest frame bundle per camera position
        self.cv_bridge = CvBridge()
        self.lock = threading.Lock()  # guards camera_data (subscriber vs. service)
        '''init'''
        self._init_param()
        if mode == 'detect':
            self._init_model()
        else:
            self.function = None
            self.get_logger().error('Error: Mode Unknown')
        if self.device == 'cpu':
            self.calculate_function = calculate_pose_cpu
        elif self.device == 'gpu':
            raise NotImplementedError('Function: calculate_pose_gpu not implemented')
        else:
            raise ValueError(f"device must be cpu or gpu, now {self.device}")
        self._init_publisher()
        self._init_subscriber()
        self._init_service()

    def _init_param(self):
        """Declare ROS parameters, including the two hand-eye matrices."""
        self.declare_parameter('checkpoint_name', 'hivecorebox-seg.pt')
        self.checkpoint_name = self.get_parameter('checkpoint_name').value
        self.checkpoint_path = os.path.join(share_dir, 'checkpoints', self.checkpoint_name)
        self.declare_parameter('output_boxes', True)
        self.output_boxes = self.get_parameter('output_boxes').value
        self.declare_parameter('output_masks', True)
        self.output_masks = self.get_parameter('output_masks').value
        self.declare_parameter('set_confidence', 0.60)
        self.set_confidence = self.get_parameter('set_confidence').value
        self.declare_parameter('device', 'cpu')
        self.device = self.get_parameter('device').value
        self.declare_parameter('classes', 'None')
        # 'classes' arrives as a string and is parsed as a Python literal.
        self.classes = ast.literal_eval(self.get_parameter('classes').value)
        self.declare_parameter('eye_in_hand', [1.0, 0.0, 0.0, 0.0,
                                               0.0, 1.0, 0.0, 0.0,
                                               0.0, 0.0, 1.0, 0.0,
                                               0.0, 0.0, 0.0, 1.0])
        self.eye_in_hand_mat = np.array(self.get_parameter('eye_in_hand').value).reshape(4, 4)
        self.declare_parameter('eye_to_hand', [1.0, 0.0, 0.0, 0.0,
                                               0.0, 1.0, 0.0, 0.0,
                                               0.0, 0.0, 1.0, 0.0,
                                               0.0, 0.0, 0.0, 1.0])
        self.eye_to_hand_mat = np.array(self.get_parameter('eye_to_hand').value).reshape(4, 4)

    def _init_model(self):
        """Load the YOLO checkpoint and select the inference entry point."""
        device_model = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        try:
            self.model = YOLO(self.checkpoint_path).to(device_model)
        except Exception as e:
            self.get_logger().error(f'Failed to load YOLO model: {e}')
            raise
        self.get_logger().info(f'Loading checkpoint from: {self.checkpoint_path}')
        if self.checkpoint_name.endswith('-seg.pt'):
            self.function = self._seg_image
        else:
            # Non-segmentation checkpoint: no handler is installed.
            self.function = None
            self.get_logger().error(f'Unknown checkpoint: {self.checkpoint_name}')

    def _init_publisher(self):
        """Create the annotated-image publisher when any overlay is enabled."""
        if self.output_boxes or self.output_masks:
            self.pub_detect_image = self.create_publisher(Image, '/image/detect_image', 10)

    def _init_service(self):
        """Create the object-recognition service server."""
        self.server = self.create_service(
            VisionObjectRecognition,
            "/vision_object_recognition",
            self._service_callback
        )

    def _init_subscriber(self):
        """Subscribe to the camera frame stream."""
        self.sub_img = self.create_subscription(
            ImgMsg,
            "/img_msg",
            self._sub_callback,
            10
        )

    def _sub_callback(self, msg):
        """Cache the latest frame bundle keyed by the camera position."""
        with self.lock:
            self.camera_data[msg.position] = [
                msg.image_color,
                msg.image_depth,
                msg.karr,  # intrinsics
                msg.darr   # distortion coefficients
            ]

    def _service_callback(self, request, response):
        """Run detection on the requested camera's latest frame and fill the response."""
        response.header.stamp = self.get_clock().now().to_msg()
        response.header.frame_id = "camera_detect"
        with self.lock:
            if request.camera_position in self.camera_data:
                color_img_ros, depth_img_ros, self.K, D = self.camera_data[request.camera_position]
            else:
                response.success = False
                response.info = f"{request.camera_position} Camera data is empty or name is wrong"
                response.objects = []
                return response
        # Wrist cameras use the eye-in-hand calibration; anything else is
        # treated as the fixed (eye-to-hand) camera.
        if request.camera_position == 'left' or request.camera_position == 'right':
            hand_eye_mat = self.eye_in_hand_mat
        else:
            hand_eye_mat = self.eye_to_hand_mat
        self.camera_size = [color_img_ros.width, color_img_ros.height]
        color_img_cv = self.cv_bridge.imgmsg_to_cv2(color_img_ros, "bgr8")
        depth_img_cv = self.cv_bridge.imgmsg_to_cv2(depth_img_ros, '16UC1')
        # Undistort both images and switch to the post-undistortion intrinsics.
        map1, map2, self.K = get_map(self.K, D, self.camera_size)
        color_img_cv, depth_img_cv = distortion_correction(color_img_cv, depth_img_cv, map1, map2)
        img, pose_dict = self.function(color_img_cv, depth_img_cv, hand_eye_mat)
        """masks为空结束这一帧"""
        # When nothing was segmented, publish the raw frame instead.
        if self.output_boxes or self.output_masks:
            if img is None:
                img = color_img_ros
            self.pub_detect_image.publish(img)
        if pose_dict:
            response.info = "Success get pose"
            response.success = True
            # self.get_logger().info('get_pose')
            for (class_id, class_name), pose_list in pose_dict.items():
                response.objects.append(
                    PoseClassAndID(
                        class_name = class_name,
                        class_id = class_id,
                        pose_list = pose_list
                    )
                )
        else:
            response.info = "pose dict is empty"
            response.success = False
            response.objects = []
        return response

    def _seg_image(self, rgb_img: np.ndarray, depth_img: np.ndarray, hand_eye_mat):
        """Segment objects and estimate one hand-eye-mapped pose per instance.

        Returns:
            (annotated ROS Image or None, pose_dict) where pose_dict maps
            (class_id, class_name) -> list[Pose]; (None, None) when nothing
            was segmented.
        """
        pose_dict = defaultdict(list)
        # Black out pixels with no depth or farther than 2 m before
        # inference so the detector only sees reachable objects.
        depth_filter_mask = np.zeros_like(depth_img, dtype=np.uint8)
        depth_filter_mask[(depth_img > 0) & (depth_img < 2000)] = 1
        rgb_img[depth_filter_mask == 0] = 0
        '''Get Predict Results'''
        # NOTE(review): classes is hard-coded to [0]; the declared 'classes'
        # parameter is ignored here.
        results = self.model(rgb_img, retina_masks=True, conf=self.set_confidence, classes=[0])
        result = results[0]
        '''Get masks'''
        if result.masks is None or len(result.masks) == 0:
            return None, None
        masks = result.masks.data.cpu().numpy()
        orig_shape = result.masks.orig_shape
        '''Get boxes'''
        boxes = result.boxes.xywh.cpu().numpy()
        class_ids = result.boxes.cls.cpu().numpy()
        labels = result.names
        for i, (mask, box) in enumerate(zip(masks, boxes)):
            mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
            depth_crop, mask_crop, (x_min, y_min) = crop_mask_bbox(depth_img, mask, box)
            if depth_crop is None:
                continue
            # Shift the principal point into the crop's coordinate frame.
            intrinsics = o3d.camera.PinholeCameraIntrinsic(
                int(self.camera_size[0]),
                int(self.camera_size[1]),
                self.K[0],
                self.K[4],
                self.K[2] - x_min,
                self.K[5] - y_min
            )
            x, y, z, rw, rx, ry, rz = self.calculate_function(mask_crop, depth_crop, intrinsics, hand_eye_mat)
            if (x, y, z) == (None, None, None):
                self.get_logger().error("have wrong pose")
                continue
            # An all-zero position is the "no valid points" sentinel and is
            # silently skipped.
            if (x, y, z) != (0.0, 0.0, 0.0):
                pose = Pose()
                pose.position = Point(x=x, y=y, z=z)
                pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
                pose_dict[int(class_ids[i]), labels[class_ids[i]]].append(pose)
        '''mask_img and box_img is or not output'''
        if self.output_boxes and not self.output_masks:
            draw_box(self.set_confidence, rgb_img, result)
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
        elif self.output_boxes and self.output_masks:
            draw_box(self.set_confidence, rgb_img, result)
            draw_mask(self.set_confidence, rgb_img, result)
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
        elif not self.output_boxes and self.output_masks:
            draw_mask(self.set_confidence, rgb_img, result)
            return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
        else:
            return None, pose_dict
def main(args=None):
    """Entry point: initialise rclpy, spin the node, clean up on exit."""
    rclpy.init(args=args)
    node = DetectNode('detect', 'detect')
    try:
        rclpy.spin(node)
    except KeyboardInterrupt:
        pass
    finally:
        # Always release node resources and shut the context down cleanly.
        node.destroy_node()
        rclpy.shutdown()
if __name__ == '__main__':
    main()

View File

@@ -1,415 +0,0 @@
import os
import time
import threading
import ast
from collections import defaultdict
from ament_index_python.packages import get_package_share_directory
import cv2
import open3d as o3d
import numpy as np
import transforms3d as tfs
from cv_bridge import CvBridge
import torch
from ultralytics import YOLO
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from geometry_msgs.msg import Pose, Point, Quaternion
from interfaces.msg import PoseClassAndID, ImgMsg
from interfaces.srv import VisionObjectRecognition
share_dir = get_package_share_directory('vision_detect')
def get_map(K, D, camera_size):
    """Build cv2 undistortion remap tables.

    Returns (map1, map2, new_K.flatten()): remap LUTs plus the flattened
    optimal intrinsics valid after undistortion.
    """
    h, w = camera_size[::-1]  # camera_size is (width, height)
    K = np.array(K).reshape(3, 3)
    D = np.array(D)
    new_K, _ = cv2.getOptimalNewCameraMatrix(K, D, (w, h), 1, (w, h))
    map1, map2 = cv2.initUndistortRectifyMap(K, D, None, new_K, (w, h), cv2.CV_32FC1)
    return map1, map2, new_K.flatten()
def pca(data, sort=True):
    """Principal component analysis of an (N, D) point set.

    Returns (eigenvalues, eigenvectors) of the covariance matrix —
    descending when sort=True — or (None, None) on failure.
    """
    center = np.mean(data, axis=0)
    centered_points = data - center  # remove the centroid
    try:
        cov_matrix = np.cov(centered_points.T)  # np.cov wants variables in rows
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    except np.linalg.LinAlgError:
        return None, None
    if sort:
        # NOTE: rebinds the boolean parameter `sort` to the index array.
        sort = eigenvalues.argsort()[::-1]  # descending order
        eigenvalues = eigenvalues[sort]
        eigenvectors = eigenvectors[:, sort]
    return eigenvalues, eigenvectors
def calculate_pose_cpu(mask, depth_img: np.ndarray, intrinsics, hand_eye_mat):
    """PCA-based pose of the masked depth region, mapped through hand-eye.

    Returns:
        (x, y, z, rw, rx, ry, rz); all zeros when no points survive
        filtering, (0, 0, 0, None, None, None, None) when PCA fails.
    """
    depth_img_mask = np.zeros_like(depth_img)
    depth_img_mask[mask > 0] = depth_img[mask > 0]
    depth_o3d = o3d.geometry.Image(depth_img_mask.astype(np.uint16))
    point_cloud = o3d.geometry.PointCloud.create_from_depth_image(
        depth=depth_o3d,
        intrinsic=intrinsics,
        depth_scale=1000.0,  # millimetres -> metres
        depth_trunc=3.0,  # discard points beyond 3 m
    )
    point_cloud = point_cloud.remove_non_finite_points()
    down_pcd = point_cloud.voxel_down_sample(voxel_size=0.020)
    # Two-stage outlier rejection: radius filter, then statistical filter.
    clean_pcd, _ = down_pcd.remove_radius_outlier(nb_points=10, radius=0.1)
    clean_pcd, _ = clean_pcd.remove_statistical_outlier(nb_neighbors=20, std_ratio=3.0)
    if len(clean_pcd.points) == 0:
        return 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
    center = clean_pcd.get_center()
    x, y, z = center
    w, v = pca(np.asarray(clean_pcd.points))
    if w is not None and v is not None:
        vx, vy, vz = v[:,0], v[:,1], v[:,2]
        # Fix the sign of the first two axes for a reproducible frame, then
        # flip the third if needed so the basis stays right-handed.
        if vx[0] < 0:
            vx = -vx
        if vy[1] < 0:
            vy = -vy
        if not np.allclose(np.cross(vx, vy), vz):
            vz = -vz
        R = np.column_stack((vx, vy, vz))
        rmat = tfs.affines.compose(np.squeeze(np.asarray((x, y, z))), R, [1, 1, 1])
        rmat = hand_eye_mat @ rmat  # camera frame -> robot frame
        rw, rx, ry, rz = tfs.quaternions.mat2quat(rmat[0:3, 0:3])
        x, y, z = rmat[0:3, 3].flatten()
        return x, y, z, rw, rx, ry, rz
    else:
        return 0.0, 0.0, 0.0, None, None, None, None
def draw_box(set_confidence, rgb_img, result):
    """Draw a labelled bounding box (in place) for each detection whose
    confidence reaches set_confidence."""
    boxes = result.boxes.xywh.cpu().numpy()
    confidences = result.boxes.conf.cpu().numpy()
    class_ids = result.boxes.cls.cpu().numpy()
    labels = result.names
    for i, box in enumerate(boxes):
        if confidences[i] >= set_confidence:
            x_center, y_center, width, height = box[:4]
            # Convert centre/size to the two opposite corners.
            p1 = [int((x_center - width / 2)), int((y_center - height / 2))]
            p2 = [int((x_center + width / 2)), int((y_center + height / 2))]
            cv2.rectangle(rgb_img, p1, p2, (255, 255, 0), 2)
            cv2.putText(rgb_img, f'{labels[class_ids[i]]}: {confidences[i]*100:.2f}', (p1[0], p1[1] - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 0), 2)
def draw_mask(set_confidence, rgb_img, result):
    """Alpha-blend red (in place) over each confident instance mask."""
    masks = result.masks.data.cpu().numpy()
    orig_shape = result.masks.orig_shape
    confidences = result.boxes.conf.cpu().numpy()
    for i, mask in enumerate(masks):
        if confidences[i] >= set_confidence:
            # Upscale the low-resolution mask to the original image size.
            mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
            # 50/50 blend with pure red (BGR order) over masked pixels.
            rgb_img[mask > 0] = rgb_img[mask > 0] * 0.5 + np.array([0, 0, 255]) * 0.5
        else:
            continue
def distortion_correction(color_image, depth_image, map1, map2):
    """Undistort a color/depth pair with precomputed remap tables.

    Color is resampled bilinearly; depth uses nearest-neighbour so no
    interpolated depth values are invented.
    """
    undistorted_color = cv2.remap(color_image, map1, map2, cv2.INTER_LINEAR)
    undistorted_color = undistorted_color.astype(color_image.dtype)
    undistorted_depth = cv2.remap(depth_image, map1, map2, cv2.INTER_NEAREST)
    undistorted_depth = undistorted_depth.astype(depth_image.dtype)
    return undistorted_color, undistorted_depth
def crop_mask_bbox(depth_img, mask, box):
    """Crop depth and mask to a detection's bounding box, clamped to bounds.

    Args:
        depth_img: depth image, H x W.
        mask: instance mask, H x W (0/1 or bool).
        box: (x_center, y_center, w, h) in pixels.

    Returns:
        (depth_crop, mask_crop, (x_off, y_off)) — the offsets are the crop's
        top-left corner in full-image coordinates.
    """
    high, width = depth_img.shape  # `high` is the image height
    x_center, y_center, w, h = box[:4]
    x_min, x_max = int(round(x_center - w/2)), int(round(x_center + w/2))
    y_min, y_max = int(round(y_center - h/2)), int(round(y_center + h/2))
    # Clamp to the image; numpy slicing clips an end index past the edge.
    depth_crop = depth_img[max(0, y_min):min(y_max, high) + 1, max(0, x_min):min(x_max, width) + 1]
    mask_crop = mask[max(0, y_min):min(y_max, high) + 1, max(0, x_min):min(x_max, width) + 1]
    return depth_crop, mask_crop, (max(0, x_min), max(0, y_min))
class DetectNode(Node):
def __init__(self, name):
super().__init__(name)
self.device = None
self.checkpoint_path = None
self.checkpoint_name = None
self.output_boxes = None
self.output_masks = None
self.function = None
self.K = None
self.eye_in_hand_mat = None
self.eye_to_hand_mat = None
self.calculate_function = calculate_pose_cpu
self.fx = self.fy = 0.5
self.camera_data = {}
self.cv_bridge = CvBridge()
self.lock = threading.Lock()
'''init'''
self._init_param()
self._init_model()
self._init_publisher()
self._init_subscriber()
self._init_service()
def _init_param(self):
"""init parameter"""
self.declare_parameter('checkpoint_name', 'yolo11s-seg.pt')
self.checkpoint_name = self.get_parameter('checkpoint_name').value
self.checkpoint_path = os.path.join(share_dir, 'checkpoints', self.checkpoint_name)
self.declare_parameter('output_boxes', True)
self.output_boxes = self.get_parameter('output_boxes').value
self.declare_parameter('output_masks', False)
self.output_masks = self.get_parameter('output_masks').value
self.declare_parameter('set_confidence', 0.25)
self.set_confidence = self.get_parameter('set_confidence').value
self.declare_parameter('classes', 'None')
self.classes = ast.literal_eval(self.get_parameter('classes').value)
self.declare_parameter('eye_in_hand', [1.0, 0.0, 0.0, 0.0,
0.0, 1.0, 0.0, 0.0,
0.0, 0.0, 1.0, 0.0,
0.0, 0.0, 0.0, 1.0])
self.eye_in_hand_mat = np.array(self.get_parameter('eye_in_hand').value).reshape(4, 4)
self.declare_parameter('eye_to_hand', [1, 0, 0, 0,
0, 1, 0, 0,
0, 0, 1, 0,
0, 0, 0, 1])
self.eye_to_hand_mat = np.array(self.get_parameter('eye_to_hand').value).reshape(4, 4)
def _init_model(self):
"""init model"""
device_model = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
try:
self.model = YOLO(self.checkpoint_path).to(device_model)
except Exception as e:
self.get_logger().error(f'Failed to load YOLO model: {e}')
raise
self.get_logger().info(f'Loading checkpoint from: {self.checkpoint_path}')
if self.checkpoint_name.endswith('-seg.pt'):
self.function = self._seg_image
else:
self.function = None
self.get_logger().error(f'Unknown checkpoint: {self.checkpoint_name}')
def _init_publisher(self):
"""init publisher"""
if self.output_boxes or self.output_masks:
self.pub_detect_image = self.create_publisher(Image, '/image/detect_image', 10)
def _init_service(self):
"""init service server"""
self.server = self.create_service(
VisionObjectRecognition,
"/vision_object_recognition",
self._service_callback
)
def _init_subscriber(self):
"""init subscriber"""
self.sub_img = self.create_subscription(
ImgMsg,
"/img_msg",
self._sub_callback,
10
)
def _sub_callback(self, msg):
"""同步回调函数"""
with self.lock:
self.camera_data[msg.position] = [
msg.image_color,
msg.image_depth,
msg.karr,
msg.darr
]
def _service_callback(self, request, response):
    """Service handler: fetch the cached frame for the requested camera,
    undistort it, run the detection handler, and fill the response with
    per-class object poses.

    Args:
        request: VisionObjectRecognition request; ``camera_position`` names
            the camera whose most recent cached frame is used.
        response: response message to populate (header, success, info, objects).

    Returns:
        The populated response message.
    """
    response.header.stamp = self.get_clock().now().to_msg()
    response.header.frame_id = "camera_detect"
    # Copy the cached frame references under the lock; heavy processing
    # (undistortion, inference) happens after it is released.
    # NOTE(review): self.K and self.camera_size are overwritten per request —
    # concurrent service calls would race on them; confirm the node runs on a
    # single-threaded executor.
    with self.lock:
        if request.camera_position in self.camera_data:
            color_img_ros, depth_img_ros, self.K, D = self.camera_data[request.camera_position]
        else:
            response.success = False
            response.info = f"{request.camera_position} Camera data is empty or name is wrong"
            response.objects = []
            return response
    # Wrist cameras ('left'/'right') use the eye-in-hand calibration; any
    # other camera position uses the fixed eye-to-hand calibration.
    if request.camera_position == 'left' or request.camera_position == 'right':
        hand_eye_mat = self.eye_in_hand_mat
    else:
        hand_eye_mat = self.eye_to_hand_mat
    self.camera_size = [color_img_ros.width, color_img_ros.height]
    color_img_cv = self.cv_bridge.imgmsg_to_cv2(color_img_ros, "bgr8")
    depth_img_cv = self.cv_bridge.imgmsg_to_cv2(depth_img_ros, '16UC1')
    # Build undistortion maps; get_map also returns the rectified intrinsics.
    map1, map2, self.K = get_map(self.K, D, self.camera_size)
    color_img_cv, depth_img_cv = distortion_correction(color_img_cv, depth_img_cv, map1, map2)
    # self.function is the model-specific handler selected in _init_model.
    img, pose_dict = self.function(color_img_cv, depth_img_cv, hand_eye_mat)
    """masks为空结束这一帧"""
    # (original note above: "end this frame when masks are empty")
    if self.output_boxes or self.output_masks:
        # No overlay produced: publish the raw color image as a fallback.
        if img is None:
            img = color_img_ros
        self.pub_detect_image.publish(img)
    if pose_dict:
        response.info = "Success get pose"
        response.success = True
        for (class_id, class_name), pose_list in pose_dict.items():
            response.objects.append(
                PoseClassAndID(
                    class_name = class_name,
                    class_id = class_id,
                    pose_list = pose_list
                )
            )
        return response
    else:
        response.info = "pose dict is empty"
        response.success = False
        response.objects = []
        return response
def _seg_image(self, rgb_img: np.ndarray, depth_img: np.ndarray, hand_eye_mat):
    """Run the segmentation model and estimate one pose per detected instance.

    Args:
        rgb_img: undistorted BGR image; overlays are drawn on it in place
            when box/mask output is enabled.
        depth_img: undistorted depth image aligned with ``rgb_img``.
        hand_eye_mat: 4x4 hand-eye calibration matrix passed through to the
            pose computation.

    Returns:
        Tuple ``(annotated_image_or_None, pose_dict)`` where ``pose_dict``
        maps ``(class_id, class_name)`` to a list of Pose messages; both
        elements are ``None`` when the model returns no masks.
    """
    pose_dict = defaultdict(list)
    # --- model prediction (retina_masks yields full-resolution masks) ---
    time1 = time.time()
    results = self.model(rgb_img, retina_masks=True, conf=self.set_confidence, classes=self.classes)
    time2 = time.time()
    result = results[0]
    # No masks: nothing detected in this frame.
    if result.masks is None or len(result.masks) == 0:
        return None, None
    masks = result.masks.data.cpu().numpy()
    orig_shape = result.masks.orig_shape
    # --- boxes, class ids and label map ---
    boxes = result.boxes.xywh.cpu().numpy()
    class_ids = result.boxes.cls.cpu().numpy()
    labels = result.names
    time3 = time.time()
    for i, (mask, box) in enumerate(zip(masks, boxes)):
        # orig_shape is (h, w) but cv2.resize expects (w, h).
        mask = cv2.resize(mask.astype(np.uint8), orig_shape[::-1], interpolation=cv2.INTER_NEAREST)
        depth_crop, mask_crop, (x_min, y_min) = crop_mask_bbox(depth_img, mask, box)
        if depth_crop is None:
            continue
        # Shift the principal point so the intrinsics match the cropped ROI.
        intrinsics = o3d.camera.PinholeCameraIntrinsic(
            int(self.camera_size[0]),
            int(self.camera_size[1]),
            self.K[0],
            self.K[4],
            self.K[2] - x_min,
            self.K[5] - y_min
        )
        x, y, z, rw, rx, ry, rz = self.calculate_function(mask_crop, depth_crop, intrinsics, hand_eye_mat)
        # (0, 0, 0) is the sentinel for "pose could not be computed".
        if (x, y, z) != (0.0, 0.0, 0.0):
            pose = Pose()
            pose.position = Point(x=x, y=y, z=z)
            pose.orientation = Quaternion(w=rw, x=rx, y=ry, z=rz)
            pose_dict[int(class_ids[i]), labels[class_ids[i]]].append(pose)
    time4 = time.time()
    # --- timing breakdown for profiling ---
    self.get_logger().info(f'start')
    self.get_logger().info(f'{(time2 - time1) * 1000} ms, model predict')
    self.get_logger().info(f'{(time3 - time2) * 1000} ms, get mask and some param')
    self.get_logger().info(f'{(time4 - time3) * 1000} ms, calculate all mask PCA')
    self.get_logger().info(f'{(time4 - time1) * 1000} ms, completing a picture entire process')
    self.get_logger().info(f'end')
    # Draw the requested overlays (boxes first, matching the original order),
    # then return a single annotated image; replaces a four-way if/elif that
    # duplicated the draw/convert calls — behavior is identical.
    if self.output_boxes:
        draw_box(self.set_confidence, rgb_img, result)
    if self.output_masks:
        draw_mask(self.set_confidence, rgb_img, result)
    if self.output_boxes or self.output_masks:
        return self.cv_bridge.cv2_to_imgmsg(rgb_img, "bgr8"), pose_dict
    return None, pose_dict
def main(args=None):
    """Entry point: initialize rclpy and spin the detect node until shutdown."""
    rclpy.init(args=args)
    detect_node = DetectNode('detect')
    try:
        rclpy.spin(detect_node)
    except KeyboardInterrupt:
        pass  # Ctrl-C is the normal way to stop the node
    finally:
        # Always release node resources and shut the context down.
        detect_node.destroy_node()
        rclpy.shutdown()


if __name__ == '__main__':
    main()

Some files were not shown because too many files have changed in this diff Show More